def requestAPIForURL(amount):
    """Collect up to *amount* article URLs from the NBD finance column API.

    The API is cursor-paginated: each response carries the ID of its last
    article, which becomes the ``last_article`` cursor of the next request.
    Each page yields 30 links, so ceil(amount / 30) pages are fetched.

    :param amount: desired number of article URLs (numeric or numeric string).
    :return: list of article URLs (best effort; failed pages are skipped).
    """
    # Bootstrap the pagination cursor from the column's landing page.
    html = crawlUtils.crawlWorker("http://finance.nbd.com.cn/", "Anon", 0)[0]
    lastArticleID = nbdCrawlMethod.GET_LA_REGEX.findall(html)[0]
    amount = float(amount)
    # Ceiling division: one extra page when amount is not a multiple of 30.
    needPages = int(amount // 30) + (0 if amount % 30 == 0 else 1)
    result = []
    for _ in range(needPages):
        try:
            url = "http://finance.nbd.com.cn/columns/119?last_article=%s" % lastArticleID
            APIHTML = crawlUtils.crawlWorker(url, "Anon", 0)[0]
            links = ["http://www.nbd.com.cn/articles/%s.html" % x
                     for x in nbdCrawlMethod.EXTRACT_LINKS_REGEX.findall(APIHTML)]
            # Advance the cursor before accumulating the page's links.
            lastArticleID = nbdCrawlMethod.GET_LA_REGEX.findall(APIHTML)[0]
            result += links
        except Exception:
            # Originally a bare ``except: pass`` — keep the best-effort
            # skip-on-failure semantics, but stop swallowing
            # KeyboardInterrupt/SystemExit.
            continue
    return result
def requestAPIForURL(amount):
    """Gather article URLs from the dzzq listing pages.

    Each listing page holds 30 entries, so ceil(amount / 30) pages are
    crawled.  Slugs containing "list" or beginning with "S" are ignored.
    """
    amount = float(amount)
    exact = amount / 30
    needPages = int(exact) if exact == amount // 30 else int(exact) + 1
    result = []
    for page in range(1, needPages + 1):
        listing = crawlUtils.crawlWorker(
            "http://www.dzzq.com.cn/list_111_%s.html" % page, "Anon", 0)['raw']
        for slug in dzzqCrawlMethod.GET_LINK_REGEX.findall(listing):
            if "list" in slug or slug[0] == "S":
                continue
            result.append("http://www.dzzq.com.cn/finance/%s.html" % slug)
    return result
def requestAPIForURL(amount):
    """Gather article URLs from the NIFA index pages (10 entries per page).

    Builds each article URL from the three captured path segments of every
    regex match on the listing page.
    """
    amount = float(amount)
    exact = amount / 10
    needPages = int(exact) if exact == amount // 10 else int(exact) + 1
    collected = []
    for page in range(1, needPages + 1):
        listing = crawlUtils.crawlWorker(
            "http://www.nifa.org.cn/nifa/2955675/2955761/a704445f/index%s.html" % page,
            "Anon", 0)['raw']
        collected.extend(
            "http://www.nifa.org.cn/nifa/%s/%s/%s/index.html" % (m[0], m[1], m[2])
            for m in nifaCrawlMethod.GET_LINK_REGEX.findall(listing))
    return collected
def requestAPIForURL(amount):
    """Collect article URLs from the stockstar rolling-finance pages.

    Pages hold 60 entries each and page numbering starts at 2, so
    ceil(amount / 60) pages beginning at page 2 are fetched.  Entries
    containing "list" or beginning with "S" are skipped.

    :param amount: desired number of article URLs (numeric or numeric string).
    :return: list of article URLs.
    """
    amount = float(amount)
    exact = amount / 60
    needPages = int(exact) if exact == amount // 60 else int(exact) + 1
    result = []
    for page in range(2, 2 + needPages):
        APIURL = "http://www.stockstar.com/roll/finance_%s.shtml" % page
        # The crawler wraps raw responses in a minimal HTML shell; strip it.
        html = crawlUtils.crawlWorker(APIURL, "Anon", 0)['raw'] \
            .replace("<html><body><p>", "") \
            .replace("</p></body></html>", "")
        # FIX: the original reused ``i`` for both the page counter and this
        # inner loop, shadowing the outer variable — renamed for clarity.
        for slug in zqzxCrawlMethod.GET_LINK_REGEX.findall(html):
            if "list" not in slug and slug[0] != "S":
                result.append("https://finance.stockstar.com/%s.shtml" % slug)
    return result
def requestAPIForURL(amount):
    """Collect post URLs from the TMTPost home-list API (10 items per page).

    :param amount: desired number of post URLs (numeric or numeric string).
    :return: list of ``short_url`` values for entries whose ``item_type``
        is ``"post"``.
    """
    amount = float(amount)
    exact = amount / 10
    needPages = int(exact) if exact == amount // 10 else int(exact) + 1
    result = []
    # NOTE(review): offsets start at 10 (page index 1), so the 10 items at
    # offset 0 are never requested — confirm whether that is intentional.
    for page in range(1, 1 + needPages):
        APIURL = ("https://www.tmtpost.com/httpsserver/common/get"
                  "?url=/v1/lists/home&data=offset=%s" % (page * 10))
        # The crawler wraps raw responses in a minimal HTML shell; strip it
        # before JSON decoding.  FIX: removed a leftover debug ``print(html)``
        # that dumped every raw API response to stdout.
        payload = crawlUtils.crawlWorker(APIURL, "Anon", 0)['raw'] \
            .replace("<html><body><p>", "") \
            .replace("</p></body></html>", "")
        jsonData = json.loads(payload)
        for entry in jsonData["data"]:
            if entry["item_type"] == "post":
                result.append(entry["short_url"])
    return result
def getLastestPostID():
    """Return the newest post ID scraped from the chinatimes front page."""
    front = crawlUtils.crawlWorker("http://www.chinatimes.net.cn/", "Anon", 0)
    matches = chinatimesCrawlMethod.EXTRACT_LATEST_RE.findall(front['raw'])
    return int(matches[0])
def getLastestPostID():
    """Return the newest post ID scraped from the ctoutiao front page."""
    front = crawlUtils.crawlWorker("http://www.ctoutiao.com/", "Anon", 0)
    matches = cttCrawlMethod.EXTRACT_LATEST_RE.findall(front['raw'])
    return int(matches[0])
def getLastestPostID():
    """Return the newest post ID scraped from the ctsbw article index."""
    page = crawlUtils.crawlWorker("http://www.ctsbw.com/article/", "Anon", 0)
    matches = ctsbwCrawlMethod.EXTRACT_LATEST_RE.findall(page['raw'])
    # Index 5 — presumably skips fixed/pinned matches earlier on the page;
    # TODO confirm against the page layout.
    return int(matches[5])
def getLastestPostID():
    """Return the newest post ID scraped from the 36kr front page."""
    front = crawlUtils.crawlWorker("https://36kr.com/", "Anon", 0)
    matches = krCrawlMethod.EXTRACT_LATEST_RE.findall(front['raw'])
    return int(matches[0])
def getLastestPostID():
    """Return the newest post ID scraped from the geekpark front page."""
    front = crawlUtils.crawlWorker("http://www.geekpark.net/", "Anon", 0)
    matches = geekparkCrawlMethod.EXTRACT_LATEST_RE.findall(front['raw'])
    # Index 1 — presumably the first match is not the latest post;
    # TODO confirm against the page layout.
    return int(matches[1])
def getLastestPostID():
    """Return the newest post ID from the cyzone paginated content API."""
    listing = crawlUtils.crawlWorker(
        "http://www.cyzone.cn/content/index/init?tpl=index_page&page=1",
        "Anon", 0)
    matches = cyzoneCrawlMethod.EXTRACT_LATEST_RE.findall(listing['raw'])
    return int(matches[0])
def getLastestPostID():
    """Return the newest post ID scraped from the pedata news front page."""
    front = crawlUtils.crawlWorker("https://news.pedata.cn/", "Anon", 0)
    matches = pedataCrawlMethod.EXTRACT_LATEST_RE.findall(front['raw'])
    return int(matches[0])
def getLastestPostID():
    """Return the newest post ID scraped from the jpm front page."""
    front = crawlUtils.crawlWorker("http://www.jpm.cn/", "Anon", 0)
    matches = jpmCrawlMethod.EXTRACT_LATEST_RE.findall(front['raw'])
    # Index 7 — presumably skips earlier non-article matches on the page;
    # TODO confirm against the page layout.
    return int(matches[7])