コード例 #1
0
ファイル: url2txt.py プロジェクト: dongsam/works
def url2txt(url, outEnc='euc-kr'):
	srcType, srcEnc, srcHtml = geturl.geturl(url)
	
	print srcEnc
	soup = BeautifulSoup.BeautifulSoup(srcHtml, fromEncoding=srcEnc[0], convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
	txt = getOnlyText(soup, outEnc)
	return txt
コード例 #2
0
ファイル: execute.py プロジェクト: wangzhehyd/paperpy
def Download(url):
    """Resolve *url* to a PDF link and download the paper.

    Returns the local path of the downloaded file, or False if any step
    fails (network error, unresolvable link, download failure).
    """
    try:
        # Resolve the landing page to a direct PDF link via the geturl class.
        url_instance = geturl(url)
        pdf_link = url_instance.GetUrl()
        # Download the paper with the getpaper class and return its path.
        paper_instance = getpaper(pdf_link)
        paper_path = paper_instance.GetPaper()
        return paper_path
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort False return
        return False
コード例 #3
0
ファイル: getgoogleimgs.py プロジェクト: dongsam/works
	def __init__(self, query, debug=0):
		"""
		get HTML contents at a given url 'urlstr'
		"""			
		self.debug = debug
		urlstr = "http://images.google.com/images?svnum=10&hl=en&gbv=2&q=%s" % (query)
		self.geturl = geturl.geturl(urlstr)
		if debug: 
			print '### DATA size:', len(self.geturl.data)
			
		self.getimglist(self.geturl.data)
コード例 #4
0
ファイル: url2txt.py プロジェクト: dongsam/works
	def getText(self):
		self.srcType, self.srcEnc, self.srcHtml = geturl.geturl(self.srcUrl)
		
		#self.srcHtml = self.srcHtml.replace("<br>","\n")
		#self.srcHtml = self.srcHtml.replace("<br/>","\n")

		print self.srcEnc[0]
		self.soup = BeautifulSoup.BeautifulSoup(self.srcHtml, 
											fromEncoding=self.srcEnc[0], 
											convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
		self.text = getOnlyText(self.soup, self.outEnc)
		return self.text
コード例 #5
0
ファイル: execute.py プロジェクト: wangzhehyd/paperpy
def Sendmail(url, email):
    """Download the paper behind *url* and e-mail it to *email*.

    Returns the local paper path on success, False on any failure.

    NOTE(review): `subject` and `content` are not defined in this function;
    they appear to be module-level globals — verify they exist before use.
    """
    try:
        receiver = [email]
        # Resolve the landing page to a direct PDF link via the geturl class.
        url_instance = geturl(url)
        pdf_link = url_instance.GetUrl()
        # Download the paper; its local path becomes the mail attachment.
        paper_instance = getpaper(pdf_link)
        paper_path = paper_instance.GetPaper()
        # Send the mail with the downloaded paper attached.
        attachment = paper_path
        sendmail.SendEmail(receiver, subject, content, attachment)
        return paper_path
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort False return
        return False
コード例 #6
0
ファイル: myagent.py プロジェクト: dongsam/works
	def __init__(self, urlstr, debug=0):
		"""
		Fetch the HTML at *urlstr* and run it through hparser.

		The raw page is downloaded via geturl.geturl; after this runs,
		self.parser holds the hparser instance with analyze() applied.
		"""
		self.debug = debug
		self.geturl = geturl.geturl(urlstr)
		if debug: 
			print '### DATA size:', len(self.geturl.data)

		# parse the downloaded page relative to its base URL
		# NOTE(review): hparser debug is hard-coded to 1 regardless of
		# this object's own debug flag — confirm this is intentional
		self.parser = hparser.hparser(self.geturl.baseurl, debug=1)
		self.parser.feed( self.geturl.data )
		self.parser.close()
		if debug: 
			print '### Got :', len(self.parser.data)
			print self.parser.data
		self.parser.analyze()	
		# visual separator printed unconditionally
		print '#'*50,'\n'
コード例 #7
0
ファイル: html2txt.py プロジェクト: dongsam/works
		for i in xrange(curidx-1, -1, -1):
			if self.good[i][1] < curdepth:
				return i
		return -1


	def info(self):
		"""Print the text accumulated in self.data."""
		print self.data

	def list(self):
		for url,ref in self.anchors:
			print "%s -- [%s]" % (url, ref)


if __name__ == "__main__":
	import geturl
	import sys

	#baseurl = "http://www.python.org"
	baseurl = "http://www.naver.com"
	if len(sys.argv) > 1: baseurl = sys.argv[1]

	html = geturl.geturl(baseurl)

	h2t = html2txt(baseurl, debug=0)
	
	h2t.feed( html.data)
	print '-'*50
	h2t.info()
	#h2t.list()
コード例 #8
0
        # return index. return -1 if not found
        for i in xrange(curidx - 1, -1, -1):
            if self.good[i][1] < curdepth:
                return i
        return -1

    def info(self):
        """Print the text accumulated in self.data."""
        print self.data

    def list(self):
        for url, ref in self.anchors:
            print "%s -- [%s]" % (url, ref)


if __name__ == "__main__":
    import geturl
    import sys

    #baseurl = "http://www.python.org"
    baseurl = "http://www.naver.com"
    if len(sys.argv) > 1: baseurl = sys.argv[1]

    html = geturl.geturl(baseurl)

    h2t = html2txt(baseurl, debug=0)

    h2t.feed(html.data)
    print '-' * 50
    h2t.info()
    #h2t.list()
コード例 #9
0
# Download pipeline: list the video pages, fetch each page's m3u8, download
# the segments, then merge the .ts pieces. Module/variable names are pinyin:
# yemian=page, xuhao=index, dange=single, shipin=video, hebing=merge,
# xiazai=download — inferred from usage; verify against the modules.
import getlist, geturl, daili, downloadshipin_dange, hebing_ts, getm3u8, datetime

# listing-page URL and request headers come from the daili module
url_yemian = daili.url_yemian

headers_page = daili.headers_page
headers_m3u8 = daili.headers_m3u8
# geturl appears to return a pair: x[0] page URLs, x[1] titles — TODO confirm
x = geturl.geturl(url_yemian, headers_page)
print('x[0]:', x[0])
print('x[1]:', x[1])

for xuhao in range(0, len(x[0])):
    # resolve this page's download URL, then fetch its m3u8 playlist
    url_dangeshipin = getlist.get_xiazai_url(x[0][xuhao])
    getm3u8.getm3u8(url_dangeshipin, headers_m3u8)
    start = datetime.datetime.now().replace(microsecond=0)

    downloadshipin_dange.download_file(url_dangeshipin)

    # report how long this single download took (seconds resolution)
    end = datetime.datetime.now().replace(microsecond=0)
    print(end - start)
    # merge the downloaded .ts segments for this title
    print('xuhao[1][xuhao]:', x[1][xuhao])
    hebing_ts.before_merge(x[1][xuhao])
コード例 #10
0
from getContent import getContent
from geturl import geturl

# Entry point: run the URL collector. getContent is imported but unused in
# this snippet — presumably consumed by code importing this module; verify.
geturl()
コード例 #11
0
#     shell_str = '+'.join(tmp)
#     # print(shell_str)
#     shell_str = 'copy /b ' + shell_str + str(geturl.geturl(url2)[1])+ ' 5.mp4' + '\n' + 'del *.ts'
#     return shell_str
#
#
# def wite_to_file(cmdString):
#     cwd = os.getcwd()  # 获取当前目录即dir目录下
#     print("------------------------current working directory------------------" + cwd)
#     f = open("combined.cmd", 'w')
#     f.write(cmdString)
#     f.close()

if __name__ == '__main__':
    # Listing page of the video site; geturl returns the page-URL list in
    # x[0] and a parallel list in x[1].
    url2 = 'https://www.bylj5a9019w0ccl9u8j88983w23.xyz:52789/index.php/vod/type/id/1.html'
    x = geturl.geturl(url2)
    print('x[0]:', x[0])
    print('x[1]:', x[1])
    for page_link in x[0]:
        u = getlist.getlist(page_link)
        print('u:', u)
        # u[0] is the download URL, u[1] the clip name (inferred — verify)
        url, namepian = u[0], u[1]
        print('url:', url)

        started_at = datetime.datetime.now().replace(microsecond=0)
        download_file(url, namepian)
        finished_at = datetime.datetime.now().replace(microsecond=0)
        print(finished_at - started_at)
コード例 #12
0
def main(days=1, url=None):
    """
    Retrieve and format data from the current SPC forecast online.

    Also able to get information from any archived forecast URL.

    Parameters:
        days: outlook day to fetch (1, 2, 3, or 48 meaning the days 4-8
              outlook). Used directly as the day when *url* is given.
        url:  archived forecast URL, or None to locate the most recent
              forecast via geturl.geturl().

    Returns:
        dict with keys "day" (echoes *days*), "coords" (section name ->
        coordinate lines) and "probs" (section name -> probability labels).

    Raises:
        ValueError: if the resolved day is not 1, 2, 3 or 48 (previously
        this path crashed with an UnboundLocalError on `coords`).
    """
    if url is None:
        logging.info("No URL provided, extracting most recent available forecast.")
        # Bug fix: geturl.geturl(days) was called twice (once per index),
        # doing the remote lookup twice; call it once and unpack.
        located = geturl.geturl(days)
        day = located[0]
        url = located[1]

        print(day)
    else:
        logging.info("URL provided, attempting to extract archived forecast.")
        day = days

    text = urllib.request.urlopen(url).read().decode('utf-8')
    # drop blank lines from the raw text product
    text_array = list(filter(lambda a: a != '', text.split('\n')))

    logging.info("Forecast successfully retrieved.")

    if day == 1:
        # Section headers delimit the coordinate lists in the raw product.
        torn = text_array.index('... TORNADO ...')
        hail = text_array.index('... HAIL ...')
        wind = text_array.index('... WIND ...')
        cate = text_array.index('... CATEGORICAL ...')
        end = text_array[cate:].index("&&")

        coords = {"tornado":        text_array[torn+1:hail-1],
                  "hail":           text_array[hail+1:wind-1],
                  "wind":           text_array[wind+1:cate-2],
                  "categorical":    text_array[cate+1:cate+end]}

        probs = {"tornado":         ["0.02", "0.05", "0.10", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "hail":            ["0.05", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "wind":            ["0.05", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "categorical":     ["TSTM", "MRGL", "SLGT", "ENH", "MDT", "HIGH"]}
        logging.info('Day 1 outlook coordinates stored.')

    elif day == 2 or day == 3:
        severe = text_array.index('... ANY SEVERE ...')
        cate = text_array.index('... CATEGORICAL ...')
        end = text_array[cate:].index("&&")

        coords = {"severe":        text_array[severe+1:cate-2],
                  "categorical":   text_array[cate+1:cate+end]}

        probs = {"severe":        ["0.05", "0.15", "0.30", "0.45", "0.60", "SIGN"],
                 "categorical":   ["TSTM", "MRGL", "SLGT", "ENH ", "MOD ", "HIGH"]}

        logging.info('Day {} outlook coordinates stored.'.format(day))

    elif day == 48:
        severe = text_array.index('... ANY SEVERE ...')
        end = text_array[severe:].index("&&")

        coords = {"severe": text_array[severe+1:severe+end]}
        probs = {"severe": ['D4', 'D5', 'D6', 'D7', 'D8']}

        logging.info('Days 4-8 outlook coordinates stored.')

    else:
        # fail loudly on an unsupported day instead of UnboundLocalError
        raise ValueError("Unsupported forecast day: {!r}".format(day))

    forecast_object = {
        # NOTE(review): echoes the *days* argument, not the resolved `day`;
        # preserved as-is since callers may rely on it — verify intent.
        "day": days,
        "coords": coords,
        "probs": probs
    }
    logging.info("Forecast object created.")

    return forecast_object
コード例 #13
0
ファイル: MMain.py プロジェクト: q2367018272/HomeWork11
        sql = "SELECT enterpriseName FROM temp_icp_web2 where autoID = %s"
        cursor.execute(sql, int(str[0]))
        result = cursor.fetchone()
        print(str[0], '内容获取...')
        str.append(getContent(str[1], 1))
        print(str[0], '爬取完成,读入数据库...')
        sql = "INSERT INTO Content (id,company,url,content) VALUES (%s,%s,%s,%s)"
        cursor.execute(sql, (int(str[0]), result[0], str[1], str[2]))
        db.commit()
        print(str[0], '读入成功')
    db.close()


if __name__ == '__main__':
    # NOTE(review): DB credentials are hard-coded; move them to config or
    # environment variables in real use.
    db = pymysql.connect("localhost", "root", "123456", "testdb")
    cursor = db.cursor()
    sql = "truncate table content"
    cursor.execute(sql)
    db.close()
    po = Pool(8)
    q = Manager().Queue()
    # was `list = geturl()` — renamed so the builtin `list` is not shadowed
    url_map = geturl()
    for key, value in url_map.items():
        q.put(str(key) + ' ' + value)
    # fan the queued "id url" work items out to 8 worker processes
    for i in range(8):
        po.apply_async(ToMysql, (q, ))
    po.close()
    po.join()
'''
print(getContent('99999',"http://www.nec-pbx.com/",1))
'''