Example #1
import time

import spider


def get_url_lists():
    """Collect all article links from the NJIT homepage and the jwc sub-site."""
    result = set()

    njit = spider.get_html('http://www.njit.edu.cn')
    jwc = spider.get_html('http://jwc.njit.edu.cn')

    nc = spider.njit_catcher(njit)
    result.update(nc.get_all_link())

    time.sleep(1)  # be polite between requests
    jwcc = spider.jwc_catcher(jwc)
    result.update(jwcc.get_all_link())

    print(len(result), result)
    return result
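spider.get_html itself is not shown in these examples; it is called here with a bare URL and, in Example #7, with a proxy keyword. A minimal requests-based sketch under those assumptions (the body is an illustration, not the original implementation):

import requests

def get_html(url, proxy=None):
    """Hypothetical fetch helper; signature inferred from the calls in Examples #1 and #7."""
    try:
        resp = requests.get(url, proxies=proxy, timeout=10)
        resp.encoding = resp.apparent_encoding  # the pages here are Chinese; let requests guess
        return resp.text
    except requests.RequestException:
        return None  # Example #7 treats None as a failed load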
Example #2
    def get(self, request):
        query_value = request.GET.get('query', None)
        flag = request.GET.get('flag', None)
        pageNum = request.GET.get('pageNum', None)
        last_page = request.GET.get('last_page', None)

        if query_value:
            # URL-encode the query before handing it to the crawler.
            query_value_en = urlquote(query_value)
            print('query', query_value_en)
            if flag and pageNum and last_page:
                rh_list, page_num = get_html(query_value_en, flag=flag,
                                             pageNum=pageNum, last_page=last_page)
            else:
                rh_list, page_num = get_html(query_value_en)
            page_size, current_page, maxPageNum = paging_control(page_num, request)
            SearchHistory.save2SearchHistory(query_value)
            return render(request, 'search/result.html',
                          {'rh_list': rh_list, 'query_value': query_value,
                           'last_page': page_size, 'current_page': current_page})
        else:
            return render(request, 'search/index.html', {})
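paging_control is defined elsewhere in that project. A minimal sketch consistent with the call above (it takes the total page count and the request, and returns page_size, current_page, and maxPageNum); the body, the default bound, and the parameter name are all assumptions:

def paging_control(page_num, request, max_pages=100):
    """Hypothetical pager: clamp the total page count and read the current page from the request."""
    page_size = min(int(page_num), max_pages)          # total pages actually offered
    current_page = int(request.GET.get('pageNum', 1))  # page being viewed, defaulting to the first
    return page_size, current_page, max_pages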
Example #3
import re


def get_publisher():
    clear_file()
    html = get_html('http://www.szu.edu.cn/board/userlist.asp', 'gb2312')
    # Each entry looks like '>12.Some Name</option>'.
    needed_text_pattern = r'>\d+.*?</option>'
    all_needed_list = re.findall(needed_text_pattern, html)
    if not all_needed_list:
        raise Exception('cannot find any publisher entries')
    left_tag, right_tag = (r'>\d+', r'</option>')
    publisher_list = []
    for msg_mixed_with_tags in all_needed_list:
        row_text = fing_content_between_two_marks(left_tag, right_tag,
                                                  msg_mixed_with_tags)
        row_text = row_text[1:]  # drop the '.' that follows the index number
        publisher_list.append(row_text)
    return publisher_list
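fing_content_between_two_marks (the name appears with this spelling in the source) is not shown. A minimal regex-based sketch consistent with how it is called above (left mark, right mark, then the text to search); the body is an assumption:

import re

def fing_content_between_two_marks(left_mark, right_mark, text):
    """Hypothetical helper: return the text between two regex marks, or '' if absent."""
    match = re.search(left_mark + r'(.*?)' + right_mark, text)
    return match.group(1) if match else ''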
Example #5
from flask import request, render_template


def search():
    keywords = request.args.get('q', '').split(" ")
    if request.args.get("pn") is not None:
        page = int(request.args.get("pn"))
    else:
        page = 1
    # On the first page the "previous" link points back at itself.
    if page == 1:
        pageup_url = "/search?q=" + "+".join(keywords) + "&pn=" + str(page)
    else:
        pageup_url = "/search?q=" + "+".join(keywords) + "&pn=" + str(page - 1)
    pagedown_url = "/search?q=" + "+".join(keywords) + "&pn=" + str(page + 1)
    keywords = [k for k in keywords if k != ""]  # drop empty tokens left by repeated spaces
    html = get_html(keywords, page).encode("utf-8")
    results = get_results(html)
    return render_template("search.html", results=results, pageup_url=pageup_url, pagedown_url=pagedown_url)
Example #7
import time

import spider


def get_content(mail, proxies=None):
    send_list = []
    for url in get_url_lists():
        text = spider.get_html(url, proxy=proxies)
        if text is None:
            print(url + ' cannot load')
            continue
        if '无权访问' in text:  # "access denied": the page is restricted to the campus network
            print(url + ' can only be accessed locally')
            continue

        # Pick a parser based on which site the link belongs to.
        if 'jwc' in url:
            parser = spider.jwc_parser(text)
        elif 'xinghuo' in url:
            parser = spider.xh_parser(text)
        elif 'www.njit' in url:
            parser = spider.njit_parser(text)
        else:
            print(url, 'cannot find parser')
            continue

        page_time = parser.get_time()
        if now_time == page_time:  # only forward articles published today
            print(url, ' matches time')
            title = parser.get_title()
            body = parser.get_body()

            send_list.append((page_time + " NJIT:" + title,
                              title + '\n' + body + '\n\n' + url))

        time.sleep(1)  # be polite between requests

    print(len(send_list), send_list)

    for title, body in send_list:
        mail.send_mail_to(title, body)
        time.sleep(1)
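now_time is a module-level value defined elsewhere in that project and compared against parser.get_time(), so it presumably holds the current date as a string. A minimal sketch, with the format string as an assumption:

import time

now_time = time.strftime('%Y-%m-%d')  # assumed format; it must match whatever parser.get_time() returns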
Example #8
    def get_all_link(self):
        # Each helper calls parse_link(), which records its matches in self.set.
        self.get_njit_link()
        self.get_info_link()
        self.get_xh_link()
        return self.set

    def get_njit_link(self):
        p = 'href="content.jsp(.*?)"'
        return self.parse_link(p, 'http://jwc.njit.edu.cn/content.jsp')

    def get_info_link(self):
        p = 'href="http://www.njit.edu.cn/info(.*?)"'
        return self.parse_link(p, 'http://www.njit.edu.cn/info')

    def get_xh_link(self):
        p = 'href="http://xinghuo.njit.edu.cn/info(.*?)"'
        return self.parse_link(p, 'http://xinghuo.njit.edu.cn/info')


if __name__ == '__main__':
    # r = requests.get('http://www.njit.edu.cn')
    # r.encoding = 'utf-8'
    # catcher = njit_catcher(r.text)
    # links = catcher.get_all_link()
    # print(len(links), links)
    import spider

    text = spider.get_html('http://jwc.njit.edu.cn/')
    catcher = jwc_catcher(text)
    link = catcher.get_all_link()
    print(len(link), link)
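parse_link is not part of the snippet. A minimal sketch of such a method, consistent with the calls above (a pattern with one capture group plus a URL prefix, with results accumulated into self.set); the body, and the assumption that self.text holds the fetched page, are illustrations only:

import re

def parse_link(self, pattern, prefix):
    """Hypothetical: rebuild full URLs from captured suffixes and store them in self.set."""
    links = {prefix + suffix for suffix in re.findall(pattern, self.text)}
    self.set.update(links)
    return links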
Example #9
from flask import request  # assuming a Flask view, as in Example #5


def baidu_search():
    keyword = request.args.get('wd')  # 'wd' is the query parameter Baidu itself uses
    return get_html(keyword)
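As written, baidu_search is not bound to any URL. A minimal way to register it on a Flask app, where the app object and the '/baidu' path are assumptions:

from flask import Flask

app = Flask(__name__)                                     # hypothetical app object
app.add_url_rule('/baidu', 'baidu_search', baidu_search)  # '/baidu' is an assumed path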