def url2txt(url, outEnc='euc-kr'):
    """Fetch 'url' and return its visible text re-encoded to 'outEnc'."""
    srcType, srcEnc, srcHtml = geturl.geturl(url)
    print srcEnc
    soup = BeautifulSoup.BeautifulSoup(srcHtml,
                                       fromEncoding=srcEnc[0],
                                       convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    txt = getOnlyText(soup, outEnc)
    return txt
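# 'geturl.geturl' is an external module not shown in this snippet; the
# unpacking above implies it returns a (type, encodings, html) triple. Below
# is a minimal Python 2 stand-in for such a geturl.py; the header parsing is
# an assumption, not the original module's implementation. (Other snippets
# below assume a different geturl interface: a class with a GetUrl method.)
import urllib2

def geturl(url):
    """Hypothetical stub: fetch 'url', return (content_type, [encoding], html)."""
    resp = urllib2.urlopen(url)
    ctype = resp.headers.gettype()                     # e.g. 'text/html'
    enc = resp.headers.getparam('charset') or 'utf-8'  # guessed fallback
    return ctype, [enc], resp.read()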
def Download(url):
    try:
        # Instantiate geturl and use its GetUrl method to resolve the PDF link
        url_instance = geturl(url)
        pdf_link = url_instance.GetUrl()
        # Instantiate getpaper, download the paper, and return its local path
        paper_instance = getpaper(pdf_link)
        paper_path = paper_instance.GetPaper()
        return paper_path
    except Exception:
        return False
def __init__(self, query, debug=0):
    """Fetch the Google Images results page for 'query' and parse its image list."""
    self.debug = debug
    urlstr = "http://images.google.com/images?svnum=10&hl=en&gbv=2&q=%s" % query
    self.geturl = geturl.geturl(urlstr)
    if debug:
        print '### DATA size:', len(self.geturl.data)
    self.getimglist(self.geturl.data)
def getText(self):
    self.srcType, self.srcEnc, self.srcHtml = geturl.geturl(self.srcUrl)
    #self.srcHtml = self.srcHtml.replace("<br>","\n")
    #self.srcHtml = self.srcHtml.replace("<br/>","\n")
    print self.srcEnc[0]
    self.soup = BeautifulSoup.BeautifulSoup(self.srcHtml,
                                            fromEncoding=self.srcEnc[0],
                                            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    self.text = getOnlyText(self.soup, self.outEnc)
    return self.text
def Sendmail(url, email):
    try:
        receiver = [email]
        # Instantiate geturl and use its GetUrl method to resolve the PDF link
        url_instance = geturl(url)
        pdf_link = url_instance.GetUrl()
        # Instantiate getpaper, download the paper, and use its local path
        # as the mail attachment
        paper_instance = getpaper(pdf_link)
        paper_path = paper_instance.GetPaper()
        # Send the mail; 'subject' and 'content' are module-level globals
        attachment = paper_path
        sendmail.SendEmail(receiver, subject, content, attachment)
        return paper_path
    except Exception:
        return False
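# Hypothetical usage of Sendmail: the URL and address below are illustrative,
# and module-level subject/content values are assumed to exist.
paper_path = Sendmail('https://doi.org/10.1000/example', 'reader@example.com')
if paper_path:
    print('mailed attachment:', paper_path)
else:
    print('download or send failed')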
def __init__(self, urlstr, debug=0):
    """Get HTML contents at a given url 'urlstr'."""
    self.debug = debug
    self.geturl = geturl.geturl(urlstr)
    if debug:
        print '### DATA size:', len(self.geturl.data)
    self.parser = hparser.hparser(self.geturl.baseurl, debug=1)
    self.parser.feed(self.geturl.data)
    self.parser.close()
    if debug:
        print '### Got :', len(self.parser.data)
        print self.parser.data
    self.parser.analyze()
    print '#' * 50, '\n'
        # return index. return -1 if not found
        for i in xrange(curidx - 1, -1, -1):
            if self.good[i][1] < curdepth:
                return i
        return -1

    def info(self):
        print self.data

    def list(self):
        for url, ref in self.anchors:
            print "%s -- [%s]" % (url, ref)


if __name__ == "__main__":
    import geturl
    import sys

    #baseurl = "http://www.python.org"
    baseurl = "http://www.naver.com"
    if len(sys.argv) > 1:
        baseurl = sys.argv[1]

    html = geturl.geturl(baseurl)
    h2t = html2txt(baseurl, debug=0)
    h2t.feed(html.data)
    print '-' * 50
    h2t.info()
    #h2t.list()
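# The backward scan above finds the nearest preceding entry whose depth is
# smaller than the current one, i.e. the enclosing element. A standalone
# sketch of the same search over a toy (tag, depth) stack; the name
# find_parent and the sample data are illustrative only.
def find_parent(good, curidx, curdepth):
    # Walk back from curidx-1; the first shallower entry is the parent.
    for i in xrange(curidx - 1, -1, -1):
        if good[i][1] < curdepth:
            return i
    return -1

stack = [('html', 0), ('body', 1), ('div', 2), ('p', 3)]
print find_parent(stack, curidx=3, curdepth=3)   # -> 2, the enclosing <div>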
import getlist
import geturl
import daili
import downloadshipin_dange
import hebing_ts
import getm3u8
import datetime

url_yemian = daili.url_yemian
headers_page = daili.headers_page
headers_m3u8 = daili.headers_m3u8

# geturl returns two parallel lists: x[0] holds the per-video page URLs,
# x[1] the matching names passed to before_merge below
x = geturl.geturl(url_yemian, headers_page)
print('x[0]:', x[0])
print('x[1]:', x[1])

for xuhao in range(len(x[0])):
    url_dangeshipin = getlist.get_xiazai_url(x[0][xuhao])
    getm3u8.getm3u8(url_dangeshipin, headers_m3u8)
    # Time the segment download for this video
    start = datetime.datetime.now().replace(microsecond=0)
    downloadshipin_dange.download_file(url_dangeshipin)
    end = datetime.datetime.now().replace(microsecond=0)
    print(end - start)
    # Merge the downloaded .ts segments for this video
    # print('kaishihebing')
    print('x[1][xuhao]:', x[1][xuhao])
    hebing_ts.before_merge(x[1][xuhao])
from getContent import getContent
from geturl import geturl

geturl()
#     shell_str = '+'.join(tmp)
#     # print(shell_str)
#     shell_str = 'copy /b ' + shell_str + str(geturl.geturl(url2)[1]) + ' 5.mp4' + '\n' + 'del *.ts'
#     return shell_str
#
#
# def wite_to_file(cmdString):
#     cwd = os.getcwd()  # get the current working directory (the dir directory)
#     print("------------------------current working directory------------------" + cwd)
#     f = open("combined.cmd", 'w')
#     f.write(cmdString)
#     f.close()


if __name__ == '__main__':
    url2 = 'https://www.bylj5a9019w0ccl9u8j88983w23.xyz:52789/index.php/vod/type/id/1.html'
    x = geturl.geturl(url2)
    print('x[0]:', x[0])
    print('x[1]:', x[1])
    for i in x[0]:
        u = getlist.getlist(i)
        print('u:', u)
        url = u[0]
        print('url:', url)
        namepian = u[1]
        # print(i[0], i[1])
        start = datetime.datetime.now().replace(microsecond=0)
        download_file(url, namepian)
        end = datetime.datetime.now().replace(microsecond=0)
        print(end - start)  # download finished
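# The commented-out code above builds a Windows "copy /b a.ts+b.ts out.mp4"
# command to binary-concatenate the downloaded segments. Below is a portable
# sketch of the same merge in pure Python; merge_ts, the glob pattern, and
# the output name are illustrative, not part of the original script.
import glob

def merge_ts(pattern='*.ts', out='merged.mp4'):
    # Concatenate segment bytes in sorted order, mirroring 'copy /b'.
    with open(out, 'wb') as dst:
        for name in sorted(glob.glob(pattern)):
            with open(name, 'rb') as src:
                dst.write(src.read())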
def main(days=1, url=None):
    """
    Retrieves and formats data from the current SPC forecast online.
    Also able to get information from any archived forecast URL.
    """
    if url is None:
        logging.info("No URL provided, extracting most recent available forecast.")
        day, url = geturl.geturl(days)[0], geturl.geturl(days)[1]
        print(day)
    else:
        logging.info("URL provided, attempting to extract archived forecast.")
        day = days

    text = urllib.request.urlopen(url).read().decode('utf-8')
    text_array = list(filter(lambda a: a != '', text.split('\n')))
    logging.info("Forecast successfully retrieved.")

    if day == 1:
        torn = text_array.index('... TORNADO ...')
        hail = text_array.index('... HAIL ...')
        wind = text_array.index('... WIND ...')
        cate = text_array.index('... CATEGORICAL ...')
        end = text_array[cate:].index("&&")
        coords = {"tornado": text_array[torn+1:hail-1],
                  "hail": text_array[hail+1:wind-1],
                  "wind": text_array[wind+1:cate-2],
                  "categorical": text_array[cate+1:cate+end]}
        probs = {"tornado": ["0.02", "0.05", "0.10", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "hail": ["0.05", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "wind": ["0.05", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "categorical": ["TSTM", "MRGL", "SLGT", "ENH", "MDT", "HIGH"]}
        logging.info('Day 1 outlook coordinates stored.')
    elif day == 2 or day == 3:
        severe = text_array.index('... ANY SEVERE ...')
        cate = text_array.index('... CATEGORICAL ...')
        end = text_array[cate:].index("&&")
        coords = {"severe": text_array[severe+1:cate-2],
                  "categorical": text_array[cate+1:cate+end]}
        probs = {"severe": ["0.05", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "categorical": ["TSTM", "MRGL", "SLGT", "ENH ", "MOD ", "HIGH"]}
        logging.info('Day {} outlook coordinates stored.'.format(day))
    elif day == 48:
        severe = text_array.index('... ANY SEVERE ...')
        end = text_array[severe:].index("&&")
        coords = {"severe": text_array[severe+1:severe+end]}
        probs = {"severe": ['D4', 'D5', 'D6', 'D7', 'D8']}
        logging.info('Days 4-8 outlook coordinates stored.')

    forecast_object = {"day": days, "coords": coords, "probs": probs}
    logging.info("Forecast object created.")
    return forecast_object
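# Hypothetical usage: fetch the current Day 1 outlook and inspect the result.
forecast = main(days=1)
print(forecast["day"])               # 1
print(sorted(forecast["coords"]))    # ['categorical', 'hail', 'tornado', 'wind']
print(forecast["probs"]["tornado"])  # ['0.02', '0.05', ..., '0.60', 'SIGN']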
sql = "SELECT enterpriseName FROM temp_icp_web2 where autoID = %s" cursor.execute(sql, int(str[0])) result = cursor.fetchone() print(str[0], '内容获取...') str.append(getContent(str[1], 1)) print(str[0], '爬取完成,读入数据库...') sql = "INSERT INTO Content (id,company,url,content) VALUES (%s,%s,%s,%s)" cursor.execute(sql, (int(str[0]), result[0], str[1], str[2])) db.commit() print(str[0], '读入成功') db.close() if __name__ == '__main__': db = pymysql.connect("localhost", "root", "123456", "testdb") cursor = db.cursor() sql = "truncate table content" cursor.execute(sql) db.close() po = Pool(8) q = Manager().Queue() list = geturl() for key, value in list.items(): q.put(str(key) + ' ' + value) for i in range(8): po.apply_async(ToMysql, (q, )) po.close() po.join() ''' print(getContent('99999',"http://www.nec-pbx.com/",1)) '''