Example #1
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print(cnt, len(files), data['short'])
        util.save_json(save_path, data)
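The dblp crawler examples on this page all lean on a small `util` helper module that is not shown here. As a rough, hypothetical sketch of what those helpers could look like (assuming `requests` for HTTP and plain JSON files on disk; the project's real module may well differ):

# Hypothetical sketch of the util helpers assumed by the dblp examples.
# Names mirror the calls in the snippets; the real implementations may differ.
import hashlib
import json
import os

import requests

def get_page(url):
    # Fetch a page and return its text, or '' on failure.
    try:
        return requests.get(url, timeout=30).text
    except requests.RequestException:
        return ''

def load_json(path):
    with open(path) as f:
        return json.load(f)

def save_json(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)

def listdir(folder):
    return os.listdir(folder)

def mkdir(folder):
    os.makedirs(folder, exist_ok=True)

def exists(path):
    return os.path.exists(path)

def hex_hash(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()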
Example #2
def get_links(prefix, html):
    journals = util.find_journals(html)
    conferences = util.find_conferences(html)
    journals = list(set(journals))
    conferences = list(set(conferences))
    #authors = util.find_authors(html)
    links = []
    for j in journals:
        if j[0].startswith(prefix):
            print('\t', j[0])
            inner_url = 'http://dblp.uni-trier.de/db/journals/' + j[0]
            inner = util.get_page(inner_url)
            #links += get_links('###', inner)
        else:
            links.append(('journal', j[0].split('/')[0].split('#')[0]))
    for c in conferences:
        if c[0].startswith(prefix):
            print('\t', c[0])
            inner_url = 'http://dblp.uni-trier.de/db/conf/' + c[0]
            inner = util.get_page(inner_url)
            #links += get_links('###', inner)
        else:
            links.append(('conference', c[0].split('/')[0].split('#')[0]))
    # for a in authors:
    #     links.append(('author', a[0].split('#')[0]))
    links = list(set(links))
    return links
Example #5
def get_user_info(u,fp):
    ourl = "http://www.toutiao.com/c/user/%s/" % u
    url,html = util.get_page({"url":ourl})
    p,h = get_urlinfo(url)
    params = get_params("func.js")
    uid = get_userinfo(html)
    if params is not None and uid is not None:
        params = json.loads(params)
        params["user_id"] = uid
        path = "/c/user/article/"
        nurl = "%s//%s%s" % (p,h,path)
        count = 3
        while True:
            url,html = util.get_page({"url":nurl,"data":params,"method":"post"})
            if html is None or len(html) == 0:
                util.log_msg("could not get data from url:%s,data:%s,uid:%s" % (nurl,str(params),u))
                break
            mp = json.loads(html)
            if "data" in mp and isinstance(mp["data"],list):
                if len(mp["data"]) == 0:
                    util.log_msg("no data from response.url:%s" % nurl)
                result = []
                for item in mp["data"]:
                    turl = util.parse_url(url,item["source_url"])
                    try:
                        get_article(turl,url,item,fp,result)
                    except Exception:
                        tp, e,trace = sys.exc_info()
                        util.log_msg("get article(url:%s) info error:%s" % (turl,str(e)))
                if len(result) > 0:
                    if fp is None:
                        insert_into_db(result)
                    else:
                        for item in result:
                            fp.write("[%s]\t%s\t%s\n" % (time.ctime(),u,json.dumps(item)))
            else:
                util.log_msg("no data in content.url:%s" % nurl)
            if mp["has_more"]:
                params = get_params("func.js")
                params = json.loads(params)
                params["user_id"] = uid
                nxt = mp["next"]
                for key in nxt.keys():
                    params[key]=nxt[key]
            else:
                break
            count -= 1
            if count <= 0:
                break
    else:
        util.log_msg("could not parse data from html file,need to check this out.url:%s,referer:%s." % (ourl,referer))
Example #6
def get_journals():
    pos, cnt = 1, 0
    util.mkdir(JOURNAL_FOLDER)
    while True:
        html = util.get_page(JOURNAL_URL + str(pos))
        links = util.find_journals(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'journal'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/journals/' + data[
                'short']
            util.save_json(
                os.path.join(JOURNAL_FOLDER, util.hex_hash(data['short'])),
                data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
        print('Journal', cnt)
Example #7
 def crawl_data5u(self):
     start_url = 'http://www.data5u.com/free/gngn/index.shtml'
     headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'Accept-Encoding':
         'gzip, deflate',
         'Accept-Language':
         'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
         'Cache-Control':
         'max-age=0',
         'Connection':
         'keep-alive',
         'Cookie':
         'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
         'Host':
         'www.data5u.com',
         'Referer':
         'http://www.data5u.com/free/index.shtml',
         'Upgrade-Insecure-Requests':
         '1',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
     }
     html = get_page(start_url, options=headers)
     if html:
         ip_address = re.compile(
             r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>',
             re.S)
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
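The `crawl_*` methods in this and the following proxy-pool examples are generators defined on a crawler class, and they all depend on a module-level `get_page(url, options=None)` helper. A minimal, assumed sketch of that helper (treating `options` as extra headers) and of a driver that collects every generator's output; the actual project wires this up differently, for instance via a method registry:

# Assumed stand-ins: a requests-based get_page where `options` supplies extra
# headers, plus a driver that runs every crawl_* generator it finds.
import requests

def get_page(url, options=None):
    headers = {'User-Agent': 'Mozilla/5.0'}
    if options:
        headers.update(options)
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp.text
    except requests.RequestException:
        pass
    return None

def collect_proxies(crawler):
    # De-duplicate the ip:port strings yielded by every crawl_* method.
    proxies = set()
    for name in dir(crawler):
        if name.startswith('crawl_'):
            proxies.update(getattr(crawler, name)())
    return sorted(proxies)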
Example #8
def main(forum_post_number,
         version,
         api_url='https://wiki.factorio.com/api.php',
         version_nav=True):
    session = requests.Session()
    edit_token = get_edit_token(session, api_url)
    latest_version_page_name = 'Main_Page/Latest_versions'
    version_nav_page_name = 'Template:VersionNav'

    latest_version_page = get_page(session, api_url, latest_version_page_name)
    if version_nav:
        version_nav_page = get_page(session, api_url, version_nav_page_name)

    if version in latest_version_page:
        return f'Version {version} already found on "{latest_version_page_name}". Aborting.'
    if version_nav:
        if version in version_nav_page:
            return f'Version {version} already found on "{version_nav_page_name}". Aborting.'

    if 'None' not in latest_version_page:
        new_latest_version_page = re.sub(
            r'({{Translation\|Latest experimental version}}: \[https:\/\/forums\.factorio\.com\/)\d+ \d\.\d+\.\d+',
            rf'\g<1>{forum_post_number} {version}', latest_version_page)
    else:
        new_latest_version_page = re.sub(
            r'({{Translation\|Latest experimental version}}: ){{Translation\|None}}',
            rf'\g<1>[https://forums.factorio.com/{forum_post_number} {version}]',
            latest_version_page)
    if version_nav:
        new_version_nav_page = re.sub(
            r'(}}\n)(}}\n<noinclude>{{Documentation}}<\/noinclude>)',
            rf'\1* {{{{TransLink|Version history/{version[:version.rfind(".")+1]}0#{version}|{version}}}}}\n\2',
            version_nav_page)

    edit_response_latest_version_page = edit_page(session, api_url, edit_token,
                                                  latest_version_page_name,
                                                  new_latest_version_page,
                                                  f'{version}')
    if version_nav:
        edit_response_version_nav_page = edit_page(session, api_url,
                                                   edit_token,
                                                   version_nav_page_name,
                                                   new_version_nav_page,
                                                   f'{version}')

    return edit_response_latest_version_page.text + (
        ('\n' + edit_response_version_nav_page.text) if version_nav else '')
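The wiki updater above assumes three thin MediaWiki API wrappers, `get_page`, `get_edit_token` and `edit_page`, which are not shown here. A plausible sketch using the standard `action=parse`, `meta=tokens` and `action=edit` endpoints (the project's own wrappers may differ):

# Hypothetical MediaWiki helpers matching the calls in the example above.
import requests

def get_page(session, api_url, page_name):
    # Fetch the current wikitext of a page.
    params = {'action': 'parse', 'page': page_name, 'prop': 'wikitext', 'format': 'json'}
    return session.get(api_url, params=params).json()['parse']['wikitext']['*']

def get_edit_token(session, api_url):
    # Request a CSRF token for editing.
    params = {'action': 'query', 'meta': 'tokens', 'format': 'json'}
    return session.get(api_url, params=params).json()['query']['tokens']['csrftoken']

def edit_page(session, api_url, token, page_name, text, summary):
    # Save new wikitext; returns the raw response so .text can be inspected.
    data = {'action': 'edit', 'title': page_name, 'text': text,
            'summary': summary, 'token': token, 'format': 'json'}
    return session.post(api_url, data=data)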
Example #9
 def crawl_89ip(self):
     start_url = 'http://www.89ip.cn/tqdl.html?num=50&address=&kill_address=&port=&kill_port=&isp='
     html = get_page(start_url)
     if html:
         find_ips = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
         ip_ports = find_ips.findall(html)
         for address_port in ip_ports:
             yield address_port
Example #10
 def crawl_kxdaili(self):
     for i in range(1, 4):
         start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
         html = get_page(start_url)
         ip_adress = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         re_ip_adress = ip_adress.findall(html)
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
Example #11
 def crawl_ip181(self):
     start_url = 'http://www.ip181.com/'
     html = get_page(start_url)
     ip_adress = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
     # \s* matches the whitespace between table cells
     re_ip_adress = ip_adress.findall(html)
     for adress, port in re_ip_adress:
         result = adress + ':' + port
         yield result.replace(' ', '')
Example #12
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    cnt = 0
    jour = {}
    for file_name in files:
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        cnt += 1
        print(cnt, len(files), data['short'], '|', full_name)
        if '404' not in full_name:
            jour[data['short']] = full_name
            subs = get_subs(data['short'], html)
            for sub in subs:
                html = util.get_page('http://dblp.uni-trier.de/db/journals/' +
                                     sub)
                jour[sub] = get_full_name(html)
                print('\t', sub, jour[sub])
    util.save_json('jour_name.json', jour)
Example #13
 def crawl_premproxy(self):
     for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:
         start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(
             i)
         html = get_page(start_url)
         if html:
             ip_adress = re.compile('<td data-label="IP:port ">(.*?)</td>')
             re_ip_adress = ip_adress.findall(html)
             for adress_port in re_ip_adress:
                 yield adress_port.replace(' ', '')
Example #14
 def __init__(self, shelf_id, username='', password=''):
     self.shelf = shelf_id
     self.usr = username
     self.pas = password
     self.first_page = bs4.BeautifulSoup(get_page(self.shelf, 1, LIST_VIEW), 'lxml')
     self.story_count = None
     self.pages = None
     self.stories = None
     self.perchap_wc = None
     self.wordcount = None
Example #15
 def crawl_kuaidaili(self):
     for page in range(1, 4):
         start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
         html = get_page(start_url)
         ip_adress = re.compile(
             '<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>'
         )
         re_ip_adress = ip_adress.findall(html)
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
Example #16
 def crawl_xicidaili(self):
     for page in range(1, 4):
         start_url = 'http://www.xicidaili.com/wt/{}'.format(page)
         html = get_page(start_url)
         ip_adress = re.compile(
             '<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
         )
         re_ip_adress = ip_adress.findall(html)
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
Example #17
 def crawl_ip3366(self):
     for page in range(1, 4):
         start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(
             page)
         html = get_page(start_url)
         ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches the whitespace, including the line breaks between cells
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #18
 def crawl_data5u(self):
     for i in ['gngn', 'gnpt']:
         start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
         html = get_page(start_url)
         ip_adress = re.compile(
             ' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>'
         )
         re_ip_adress = ip_adress.findall(html)
         for adress, port in re_ip_adress:
             result = adress + ':' + port
             yield result.replace(' ', '')
Example #19
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    cnt = 0
    conf = {}
    for file_name in files:
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        cnt += 1
        try:
            print(cnt, len(files), data['short'], '|', full_name)
            if '404' not in full_name:
                conf[data['short']] = full_name
                subs = get_subs(data['short'], html)
                for sub in subs:
                    html = util.get_page('http://dblp.uni-trier.de/db/conf/' +
                                         sub)
                    conf[sub] = get_full_name(html)
                    print('\t', sub, conf[sub])
        except:
            pass
    util.save_json('conf_name.json', conf)
Example #20
 def crawl_daili66(self, page_count=4):
     start_url = 'http://www.66ip.cn/{}.html'
     urls = [start_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         print('Crawling', url)
         html = get_page(url)
         if html:
             doc = pq(html)
             trs = doc('.containerbox table tr:gt(0)').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text()
                 port = tr.find('td:nth-child(2)').text()
                 yield ':'.join([ip, port])
Example #21
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        cnt += 1
        print(cnt, len(files), data['short'])
        data['links'] = get_links(data['short'], html)
Example #23
def get_authors():
    files = util.listdir(AUTHOR_FOLDER)
    util.mkdir(AUTHOR_CRALWED_FOLDER)
    for file_name in files:
        save_path = os.path.join(AUTHOR_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(AUTHOR_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        print(data['short'], full_name)
        data['links'] = get_links(data['short'], html)
        util.save_json(save_path, data)
Example #24
    def crawl_daili66(self, count=20):
        """
        Fetch proxies from www.66ip.cn
        :param count:
        :return:
        """
        url = 'http://www.66ip.cn/mo.php?tqsl={}'.format(count)

        print('Crawling 66', url)
        html = get_page(url)
        if html:
            ret = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
            for ip in ret:
                yield ip
Example #26
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        for sub in subs:
            html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = jour[sub]
        cnt += 1
        print(cnt, len(files), data['short'])
        util.save_json(save_path, data)
Example #28
 def crawl_xroxy(self):
     for i in ['CN', 'TW']:
         start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(
             i)
         html = get_page(start_url)
         if html:
             ip_adress1 = re.compile(
                 "title='View this Proxy details'>\s*(.*).*")
             re_ip_adress1 = ip_adress1.findall(html)
             ip_adress2 = re.compile(
                 "title='Select proxies with port number .*'>(.*)</a>")
             re_ip_adress2 = ip_adress2.findall(html)
             for adress, port in zip(re_ip_adress1, re_ip_adress2):
                 adress_port = adress + ':' + port
                 yield adress_port.replace(' ', '')
Example #29
 def crawl_iphai(self):
     start_url = 'http://www.iphai.com/'
     html = get_page(start_url)
     if html:
         find_tr = re.compile('<tr>(.*?)</tr>', re.S)
         trs = find_tr.findall(html)
         for s in range(1, len(trs)):
             find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
             re_ip_address = find_ip.findall(trs[s])
             find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
             re_port = find_port.findall(trs[s])
             for address, port in zip(re_ip_address, re_port):
                 address_port = address + ':' + port
                 yield address_port.replace(' ', '')
Example #30
 def crawl_ip3366(self):
     for i in range(1, 4):
         start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
         html = get_page(start_url)
         if html:
             find_tr = re.compile('<tr>(.*?)</tr>', re.S)
             trs = find_tr.findall(html)
             for s in range(1, len(trs)):
                 find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                 re_ip_address = find_ip.findall(trs[s])
                 find_port = re.compile(r'<td>(\d+)</td>')
                 re_port = find_port.findall(trs[s])
                 for address, port in zip(re_ip_address, re_port):
                     address_port = address + ':' + port
                     yield address_port.replace(' ', '')
Example #31
 def crawl_kuaidaili(self):
     for i in range(1, 4):
         start_url = [
             'http://www.kuaidaili.com/free/{}/{}/'.format(style, i)
             for style in ['intr', 'inha']
         ]
         for url in start_url:
             html = get_page(url)
             if html:
                 ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                 re_ip_address = ip_address.findall(html)
                 port = re.compile('<td data-title="PORT">(.*?)</td>')
                 re_port = port.findall(html)
                 for address, port in zip(re_ip_address, re_port):
                     address_port = address + ':' + port
                     yield address_port.replace(' ', '')
Example #32
def get_conferences():
	files = util.listdir(CONFERENCE_FOLDER)
	util.mkdir(CONFERENCE_CRALWED_FOLDER)
	cnt = 0
	for file_name in files:
		cnt += 1
		if cnt < 1970:
			continue
		save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
		data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
		html = util.get_page(data['url'])
		full_name = get_full_name(html)
		data['name'] = full_name
		try:
			print(cnt, len(files), data['short'])
		except:
			pass
		data['links'] = get_links(data['short'], html)
Example #33
 def load_stories(self):
     if self.stories is None:
         print('Loading story urls for', self.shelf)
         s = []
         for page in range(self.pages):
             print('Loading page', page, 'out of', self.pages, 'for', self.shelf)
             soup = self.first_page if page == 0 else bs4.BeautifulSoup(get_page(self.shelf, page + 1, LIST_VIEW), 'lxml')
             bold_tags = soup(class_="search_results_count")[0]('b')
             from_ = int(bold_tags[0].string)
             to = int(bold_tags[1].string)
             # e.g. "1-60" on the first page means 60 stories, but 60 - 1 = 59, so add one
             count = (to - from_) + 1
             story_list = soup(class_="story-list")[0]('li')
             for story in story_list:
                 s.append(story(class_="right")[0].h2.a['href'])
         self.stories = tuple(s)
         print(number_objects(len(self.stories), 'url(|s)'), 'loaded for', self.shelf)
     return self.stories
Example #35
def cb():
    url = "http://esf.cq.fang.com"
    #url,html = util.get_page({"url":url})
    html = open("data/esf.html").read()
    entries = get_entry(html)
    fp = open("data/result.txt","a")
    for entry in entries:
        if entry[0] is None:
            continue
        eurl = util.parse_url(url,entry[0])
        html = open("data/yubei.html").read()
        eurl,html = util.get_page({"url":eurl})
        subs = get_sub(html)
        for sub in subs:
            surl = util.parse_url(eurl,sub[0])
            if surl == eurl:
                continue
            html = open("data/list.html").read()
            nurl,items = get_list(html,surl)
    fp.close()
Example #36
def get_conferences():
	files = util.listdir(CONFERENCE_FOLDER)
	util.mkdir(CONFERENCE_CRALWED_FOLDER)
	cnt = 0
	conf = util.load_json('conf_name.json')
	for file_name in files:
		save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
		if data['short'] not in conf.keys():
			continue
		html = util.get_page(data['url'])
		subs = get_subs(data['short'], html)
		data['name'] = conf[data['short']]
		data['sub'] = {}
		if len(subs) == 0:
			data['sub']['#'] = get_publications(html)
			util.save_json(save_path, data)
		cnt += 1
Example #38
def mail_fetch(mail_domain):
    regex_domain = mail_domain.replace(".", "\\.").replace("-", "\\-")
    this_regex = mail_regex + regex_domain

    keyword = '"@%s"' % mail_domain

    mails = []
    for result in search(keyword):
        logging.info(result)
        try:
            text = get_page(result)
        except:
            logging.info(traceback.format_exc())
            continue
        matchs = re.findall(this_regex, text)
        matchs = list(set(matchs))
        for match in matchs:
            logging.info(match)
            mails.append(match)

    return list(set(mails))
Example #39
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
            util.save_json(save_path, data)
        cnt += 1
        print(cnt, len(files), data['short'])
Example #41
def get_article(url,referer,data,fp,result2):
    url, h = util.get_page({"url":url,"headers":{"Referer":referer}})
    tree = etree.HTML(h)
    scripts = [o for o in tree.xpath("//script/text()") if o.find("BASE_DATA") > -1 or o.find("__pgcInfo")>-1]
    scripts.append("console.log(JSON.stringify(BASE_DATA))")
    open("data/tmp.js","w").write("\n".join(scripts))
    r = get_params("data/tmp.js")
    if r is not None:
        mp = json.loads(r)
        obj = {"entry":data,"data":mp}
        conf = [("title",["data","artilceInfo","title"]),
                ("content",["data","artilceInfo","content"],None,html.unescape),
                ("comments",["data","commentInfo","comments_count"],0),
                ("isOriginal",["data","artilceInfo","subInfo","isOriginal"],False),
                ("url",["__const",url]),
                ("views",["entry","go_detail_count"], 0),
                ("cover",["entry","image_url"],""),
                ("abstract",["entry","abstract"], ""),
                ("source",["data","artilceInfo","subInfo","source"],""),
                ("publishtime",["data","artilceInfo","subInfo","time"]),
                ("tags",["data","artilceInfo","tagInfo","tags"],"",lambda o:",".join([so["name"] for so in o])),
                ("category",["data","headerInfo","chineseTag"],""),
            ]
        result = {}
        for cf in conf:
            v = util.get_jpath(obj,cf[1],cf[2] if len(cf)>2 else None,cf[3] if len(cf)>3 else None)
            if v is not None:
                result[cf[0]] = v
        result["id"] = hashlib.md5(url.encode("utf-8")).hexdigest()
        if "content" in result:
            result["content"],result["images"] = replace_image(result["content"],url)
            if "cover" in result and len(result["cover"])>0:
                result["cover"] = imge_transfer(util.parse_url(url,result["cover"]),url)[1]
            if len(result) > 0:
                result2.append(result)
        else:
            util.log_msg("could not parse content from html file,need to check this out.url:%s,referer:%s." % (url,referer))
    else:
        util.log_msg("could not parse data from html file,need to check this out.url:%s,referer:%s." % (url,referer))
Example #42
def get_authors():
    pos, cnt = 545504, 0
    util.mkdir(AUTHOR_FOLDER)
    while True:
        html = util.get_page(AUTHOR_URL + str(pos))
        links = util.find_authors(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'author'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/pers/hd/a/' + data['short']
            util.save_json(os.path.join(AUTHOR_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 300
        print('Author', pos, cnt)
Example #43
def get_conferences():
    pos, cnt = 1, 0
    util.mkdir(CONFERENCE_FOLDER)
    while True:
        html = util.get_page(CONFERENCE_URL + str(pos))
        links = util.find_conferences(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'conference'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/conf/' + data['short']
            util.save_json(os.path.join(CONFERENCE_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
        print('Conference', cnt)
Example #46
 def crawl_xicidaili(self):
     for i in range(1, 3):
         start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
         headers = {
             'Accept':
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
             'Cookie':
             '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
             'Host': 'www.xicidaili.com',
             'Referer': 'http://www.xicidaili.com/nn/3',
             'Upgrade-Insecure-Requests': '1',
         }
         html = get_page(start_url, options=headers)
         if html:
             find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
             trs = find_trs.findall(html)
             for tr in trs:
                 find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                 re_ip_address = find_ip.findall(tr)
                 find_port = re.compile(r'<td>(\d+)</td>')
                 re_port = find_port.findall(tr)
                 for address, port in zip(re_ip_address, re_port):
                     address_port = address + ':' + port
                     yield address_port.replace(' ', '')
Example #47
def cb():
    '''
    url = "https://www.facebook.com/"
    url,html = util.get_page({"url":url})
    #print(html)
    ##print(type(html))
    open("data/facebook.html","w").write(html)
    #html = open("data/facebook.html").read()
    ts = int(time.time())
    lsd = re.findall("name=\"lsd\" value=\"[^\"]+\"", html)
    lsd = lsd[0].split("\"")[3] if len(lsd) > 0 else None
    lgnrnd = re.findall("name=\"lgnrnd\" value=\"[^\"]+\"",html)
    lgnrnd = lgnrnd[0].split("\"")[3] if len(lgnrnd) > 0 else None
    if lsd is None or lgnrnd is None:
        print("could not parse lsd")
        sys.exit(0)
    data = {
        "lsd":lsd,
        "email":"*****@*****.**",
        "pass":"******",
        "timezone":-480,
        "lgndim":"eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNTMsImMiOjI0fQ==",
        "lgnrnd":lgnrnd,
        "lgnjs":ts,
        "ab_test_data":"AA///AAAAAAAA/A/AAA/AAAAAAAAAAAAAAAAAAAAAAAAf//fA/DBAB",
        "locale":"zh_CN",
        "login_source":"login_bluebar",
        }
    url = "https://www.facebook.com/login.php?login_attempt=1&lwv=110"
    url,html = util.get_page({"url":url,"data":data})
    open("data/facebook2.html","w").write(html)
    #url = "https://www.facebook.com/100008346345446"
    url = "https://www.facebook.com/profile.php?id=100008346345446"
    url,html = util.get_page({"url":url})
    print(url)
    open("data/profile2.html","w").write(html)
    '''

    '''
    c_id = "100014233620831"
    uid = "100008346345446"
    ts = int(time.time())
    url = "https://www.facebook.com/profile.php?id=%s&lst=%s%%3A%s%%3A%d&sk=friends&source_ref=pb_friends_tl" % (uid,c_id,uid,ts)
    url,html = util.get_page({"url":url})
    print(url)
    open("data/friends.html","w").write(html)
    '''
    params = {
        "dpr":"1",
        "__user":"******",
        "__a":"1",
        "__dyn":"7AgNeyfyGmaxx2u6aEyx91qeCwKAKGgyi8zQC-C267UKewWhE98nwgUy22EaUgxebkwy8xa5WjzEgDKuEjKewExaFQ12VVojxCUSbAWCDxi5-78O5u5o5aayrhVo9ohxGbwYUmC-UjDQ6Evwwh8gUW5oy5EG2ut5xq48a9Ef8Cu4rGUpCzo-8Gm8z8O784afxK9yUvy8lUGdyU4eQEB0",
        "__af":"j0",
        "__req":"26",
        "__be":"-1",
        "__pc":"EXP4:DEFAULT",
        "__rev":"3161010",
        "__spin_r":"3161010",
        "__spin_b":"trunk",
        "__spin_t":"1500360303"
        }
    content = open("data/friends.html").read()
    data,info = get_info(content,None)
    params["data"] = data
    ts = int(time.time())
    params["__spin_t"] = ts
    url = "https://www.facebook.com/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet"
    url,html = util.get_page({"url":url,"data":params})
    print(url)
    open("data/friends_page.html","w").write(html)
Example #48
def get(url,fname):
    url,html = util.get_page({"url":url})
    open(fname,"w").write(html)