Code Example #1
File: ap_news.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
     self.cookies = {'cookie': '_cb_ls=1; _cb=ChGdwsejPcBwqK1A; _ga=GA1.2.1067424464.1556266698; __gads=ID=b2804ef9280ce726:T=1556266708:S=ALNI_MbsZp6KMsLTd9MAhzM98UpWqF4sEQ; __qca=P0-112096547-1556266838413; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; GED_PLAYLIST_ACTIVITY=W3sidSI6Ilp4Q0YiLCJ0c2wiOjE1NTY2MTc5NjcsIm52IjowLCJ1cHQiOjE1NTY2MTc5NjAsImx0IjoxNTU2NjE3OTYwfV0.; _gid=GA1.2.1304411157.1557027854; _cb_svref=null; OptanonConsent=landingPath=NotLandingPage&datestamp=Sun+May+05+2019+11%3A44%3A56+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=4.1.0&EU=false&groups=0_140011%3A1%2C1%3A1%2C0_140010%3A1%2C2%3A1%2C3%3A1%2C4%3A1%2C0_140046%3A1%2C0_140042%3A1%2C0_140038%3A1%2C0_140034%3A1%2C0_140055%3A1%2C0_140051%3A1%2C0_140047%3A1%2C0_140043%3A1%2C0_140039%3A1%2C0_140035%3A1%2C0_140031%3A1%2C0_140052%3A1%2C0_140048%3A1%2C0_140044%3A1%2C0_140040%3A1%2C0_140036%3A1%2C0_140032%3A1%2C0_140053%3A1%2C0_140049%3A1%2C0_140045%3A1%2C0_140041%3A1%2C0_140037%3A1%2C0_140033%3A1%2C0_140054%3A1%2C0_140050%3A1%2C101%3A1%2C102%3A1%2C103%3A1%2C104%3A1%2C105%3A1%2C106%3A1%2C107%3A1%2C108%3A1%2C109%3A1%2C110%3A1%2C111%3A1%2C112%3A1%2C113%3A1%2C114%3A1%2C115%3A1%2C116%3A1%2C117%3A1%2C118%3A1%2C119%3A1%2C120%3A1%2C121%3A1%2C122%3A1%2C123%3A1%2C124%3A1%2C125%3A1%2C126%3A1%2C127%3A1%2C128%3A1%2C129%3A1%2C130%3A1%2C131%3A1%2C132%3A1%2C133%3A1%2C134%3A1%2C135%3A1%2C136%3A1%2C137%3A1%2C138%3A1%2C139%3A1%2C140%3A1%2C141%3A1%2C142%3A1%2C143%3A1%2C144%3A1%2C145%3A1%2C146%3A1%2C147%3A1%2C148%3A1%2C149%3A1%2C150%3A1%2C151%3A1%2C152%3A1%2C153%3A1%2C154%3A1%2C155%3A1&AwaitingReconsent=false; _tb_sess_r=; _tb_t_ppg=https%3A//apnews.com/245117b7dafd4790ba3d51db06cf345a; _gat=1; _chartbeat2=.1556266696382.1557028669628.1111100001.Vfd8vwJvnJujXq7Dq7JmkgXZfl.4'}
     self.downloadPath = '/data/crawler'
     self.picPath = '/ap_news/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #2
File: medium_news.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
     self.cookies = {'cookie': '__cfduid=d6ba6448200002747444269a19593dbdd1555908016; __cfruid=985eba5fa2a449247bfd0598c1c1c5ec968a9416-1558490711; _ga=GA1.2.1064338879.1558490714; _gid=GA1.2.136804405.1558490714; lightstep_guid/medium-web=8f0cd65b0ef4abdb; lightstep_session_id=fcac5cf910466bc4; pr=1; tz=-480; uid=3314454e53ae; sid=1:4N4F93p0H1gPvFCIGldZUdIdQeFiifNF6stzqPFyBikCsGpjcmnIyu/NNWwIVVTx; xsrf=89TsRPcZaZKu; lightstep_guid/lite-web=7d9b16045b97b840; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://medium.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1558512703778%2C%22slts%22:1558503751909}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:3%2C%22last_session_ts%22:1558512703778}; sz=1905'}
     self.downloadPath = '/data/crawler'
     self.picPath = '/huffpost/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #3
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/smartNews/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #4
 def __init__(self):
     self.news_api = NewsApiClient(api_key='f04f7a8db32841299d4a7fae723e61b2')
     self.t = time.time()
     self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
     self.keyword = ['us', 'world', 'opinion', 'politics', 'entertainment', 'lifestyle', 'health', 'travel', 'autos']
     self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
     self.cookies = {'cookie': '_cb_ls=1; optimizelyEndUserId=oeu1556269407120r0.4256555044820445; cto_lwid=a3569f8e-fd62-48fd-8cf3-52e3a3d49218; _gcl_au=1.1.1392012605.1556269408; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22cfa5a6d1-cac6-4a48-97ed-e2a25488a94a%22; _ga=GA1.2.353904812.1556269412; _cb=D6-ViRhsUuoBSGama; __gads=ID=0a226a472ca026e8:T=1556269422:S=ALNI_Mb8qEqiRmqgHFem87cBOSEiCTTaJQ; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _scid=47caae3a-e216-48d9-8cdc-3159238a7671; FXN_flk=1; AMCVS_17FC406C5357BA6E0A490D4D%40AdobeOrg=1; _gid=GA1.2.1114110874.1557801782; s_cc=true; _csrf=qWmVWRGxKfzqXCxI9_yuGfZI; s_sq=%5B%5BB%5D%5D; AKA_A2=A; ak_bmsc=3362DC65CD8C5F6FE2F5F2E24D7DD7FE6876060DED3200004C65DA5C0E141B34~pl9V8ncmx0JI/913nUJgfYoKX6Gte64URfMw4gBpTiaQPEzpKVnyOxRIc/NBeHS9HwdJZ+Fd5cB6oDFLpRNLt93qTu4fSjWuP7e+PZea5EArlAr63c0rHI5P+U7hKycyZfvpMt2MSsmqLqtUqZqavEQxBprGj74WIJ0a5ZnH2vSP1CYH+4ijzZPqw/REPx+WlZ+jHCptyFj7C9pjBHstMpWmr4RW6NTHMwyBsckJbiQr0p+5gPNq/FUjz06HN7q/b4; _cb_svref=null; AMCV_17FC406C5357BA6E0A490D4D%40AdobeOrg=2121618341%7CMCIDTS%7C18031%7CMCMID%7C37985443320715041480395091296536963184%7CMCAAMLH-1557842971%7C7%7CMCAAMB-1558421455%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1557823855s%7CNONE%7CMCAID%7CNONE; s_pers=%20s_ppn%3Dfnc%253Aroot%253Aroot%253Achannel%7C1557806491239%3B%20omtr_lv%3D1557816723185%7C1652424723185%3B%20omtr_lv_s%3DLess%2520than%25201%2520day%7C1557818523185%3B%20s_nr%3D1557816723191-Repeat%7C1560408723191%3B; _chartbeat2=.1556269420027.1557816723254.0000000010000001.CY0VlWCO9QUgDFFbO8QLxoyCPj7ho.2; s_sess=%20omtr_evar17%3DD%253Dc17%3B%20s_ppvl%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C22%252C83%252C5886%252C1920%252C925%252C1920%252C1080%252C1%252CL%3B%20SC_LINKS%3D%3B%20s_ppv%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C63%252C96%252C3550%252C1920%252C969%252C1920%252C1080%252C1%252CL%3B; criteo_write_test=ChUIBBINbXlHb29nbGVSdGJJZBgBIAE; bm_sv=8A6F070ED17B9F85AD022D562A830573~oN82OtrVhgL99OXQYjpsFWPKOuwBoUVwy60qge23Kx9pNN2MIe3/AhQZJZ+na42MjDAIyCRuvDS6csM6csNzVnCY/0Ue7dXJIHzFvEjq/KcL+5X57fiZK5b9W/W3g/hw1kSCvVxA/GNO4h9IlDmY6OElMgVSqN2h9kq42m6z+n0='}
     self.downloadPath = '/data/crawler'
     self.picPath = '/fox_news/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #5
File: buzzfeed_news.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         '_ga=GA1.2.2006098489.1555559856; _fbp=fb.1.1555559860721.1190642659; __qca=P0-464700868-1555559857580; permutive-id=35526ebd-337f-4b00-bf5b-10a6610a85a5; __gads=ID=2cb4be529258fba6:T=1555559912:S=ALNI_MawMBKcEjsbSC3roAOVcCQm5lCB2A; _pdfps=%5B7684%2C13160%2C13164%2C13319%2C13730%2C14474%2C10166%2C12448%2C12449%2C12882%2C13097%2C13214%2C13217%2C13276%2C13278%2C13834%2C14353%2C10748%2C10788%2C13102%2C13144%2C13145%2C13146%2C13147%2C13150%2C13151%2C13157%2C13163%2C13169%2C13667%2C14437%2C14458%2C10224%2C10915%2C13153%2C13675%2C14142%2C13064%2C13216%2C13279%2C14431%2C14432%2C10749%2C10789%2C10906%2C10916%2C10917%2C11655%2C12233%2C12244%2C12679%2C12985%2C13099%2C13101%2C13148%2C13244%2C13741%2C13742%2C14143%2C14479%2C14872%2C15077%2C15128%2C15139%2C10222%2C13100%2C10216%2C%2212244-15-22969%22%2C%2212244-15-22970%22%2C%2212679-5-118997US%22%2C%2212985-5-118497US%22%2C%2213244-5-325997US%22%2C%2213245-5-325997US%22%2C%2213246-5-325997US%22%2C%2213458-15-22969%22%2C%2213458-15-22970%22%2C%2213459-15-22969%22%2C%2213459-15-22970%22%2C%2214229-5-318346US%22%2C%2214351-15-22835%22%2C%2214479-5-325547US%22%2C%2214872-15-22835%22%2C%2214872-15-22814%22%2C%2215063-5-318346US%22%2C%2215063-5-325346US%22%5D; permutive-session=%7B%22session_id%22%3A%2215c3b65a-580a-47aa-ab95-b44076421376%22%2C%22last_updated%22%3A%222019-04-27T02%3A27%3A38.962Z%22%7D; _cmpQcif3pcsupported=1; _gid=GA1.2.13310005.1557196067; _gat=1; sailthru_pageviews=4; sailthru_content=cbe347ea3dd8f028b2a79dd2124b2609d73dc57549ee138bd1d9dedee18e797c3cde4668fc0929097a33767e5b408948300e3683df34cf01dce50805bbb1306ce0bce460f7e70fed288b52d84bd9816499693f0167a253c9d1ba851de3a9d8e9dd7ae6730eff39df6f3b2fee47cae2908e3260668e0361ea9bd2ebb68e2a0591e9ec864cd274cc1d8b3a98016c2bcf1d874e57a78b55d2f981aeb6d2c79bfecc9d43236330abbff1afd96b7ffa626bb4936065bb0196c7181b628021dea483cf13a2f044347925f429d5fbc7008162c9cd736b79ca68d62341101204bca0cca1ff22ee54be7fa316d48db768db05dda4f044956926b209e90497a64953e290f7; sailthru_visitor=a057d87e-b51c-4f1e-9167-d146c2a3a7bc'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/buzzfeed/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #6
File: nypost_news.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         'optimizelyEndUserId=oeu1555918735298r0.6812295616493853; _ga=GA1.2.501461025.1557229334; __pnahc=0; __tbc=%7Bjzx%7DbOREsfUR6SMcRp5niCu4XyJqGIm9xLbU2svbGCB3e5Y-ZlpcwIXRF_gOx5ssrGMlRCzVeeO-JA50xgthIobqDMJS2og0GbDQCa7bPklPxk1yFokaLXVHvRa0s4J7s817Uqt8s09tJ4GcmUzNoGeVhA; __pat=-14400000; __gads=ID=b83698edc796f48a:T=1557229341:S=ALNI_MZYuxiKvMlIMXV92xfTw1XB6ms7EA; __qca=P0-643878854-1557229359497; _gid=GA1.2.1904385410.1557829332; _ncg_g_id_=9d6dce3e-2d15-45b6-a948-9dbb7fa69171; OX_plg=pm; _ncg_id_=16a959d5d8b-5c91174d-2719-4974-99d8-33e86e4219c2; _pc_morningReportRan=true; _sp_ses.3725=*; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://nypost.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1557886391506%2C%22slts%22:1557829347024}; _parsely_visitor={%22id%22:%2236e3895b-5884-4e1e-b290-0b0a1e631850%22%2C%22session_count%22:3%2C%22last_session_ts%22:1557886391506}; AMP_TOKEN=%24NOT_FOUND; bounceClientVisit2045v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgHYCeEA9iggHQDGFAtkcQKYDu6BIANCAE4wQZStXpMQAXyA; _ncg_sp_ses.64db=*; _gat=1; __idcontext=eyJjb29raWVJRCI6IlpORUtKQzRVUEwzNUhUUTI2QkNJVUIySDZZVUFOTjI1V0E0VzI1WERQSU9RPT09PSIsImRldmljZUlEIjoiWk5FS0pDNFVPVFU1NzVaQTVZSU1LVlNHUlVFU0pOM0JWNEdTWVJHNUM0WEE9PT09IiwiaXYiOiJQR1ZMV1NQWTc2R0pGMkhISUJCVEpBTEc0UT09PT09PSIsInYiOjF9; __pvi=%7B%22id%22%3A%22v-2019-05-15-11-28-33-472-TCTTCzJ3MeatqGej-a5838d1dc51fc02369a5c570d5bb61d6%22%2C%22domain%22%3A%22.nypost.com%22%2C%22time%22%3A1557891488612%7D; __adblocker=false; xbc=%7Bjzx%7DCTyXA66nwH4u0LSnMj_hrMtwYTk54JF59dLs5o_wp3snMXNdvj2Yy6TBtbRxyGxf14_VW1q5TLlW6vo43sH4bt1xlU681XmGmmXaT-SetcMReVqnxTFjI2gW-7RAeJAQFo8mvk88JA2ghePCorbhbWMs02tfzF_-k1Krwk0Vz5I_4BWDD33FM1fohQjjcgYaPM-1rt-sKsCEnjEZlCFDpqiFO54mgbKUB-kFVcHhi-_WjEFJazS2Vtn_ZZJHi-y44g16CXbGiqpHfoDR9DPafHAts-4n-G65fMRtwt9Ml8JaS73yz78cdU_g515IoAaF5TiHkpwV8OOumbfwBrkq2AU3h3dtbnjKZd070tIlyyZdFCbfpjqxaxax2jiN0PitRuCioMt8p4TO3fxq6ok4tA; _ncg_sp_id.64db=d08b0f08-4e58-40a9-8cd3-63efa5ae79b6.1557229345.5.1557891492.1557886389.183b444b-a79a-4498-bd82-c06c607b176e; _sp_id.3725=b96eefdc3adfd036.1557229358.3.1557891502.1557831911'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/nypost/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #7
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         'rxx=293dmvskaws.1i9frk99&v=1; _ga=GA1.2.893461412.1556504122; _fbp=fb.1.1556504132562.1028440015; __pnahc=0; __pat=-25200000; OTH=v=1&d=eyJraWQiOiIwMTY0MGY5MDNhMjRlMWMxZjA5N2ViZGEyZDA5YjE5NmM5ZGUzZWQ5IiwiYWxnIjoiUlMyNTYifQ.eyJjdSI6eyJndWlkIjoiUDJCVVRPR0RVT0VGVERUV0pNTFdKNFlFSDQiLCJzaWQiOiJEYTNsbjdKbG11MmwifX0.HoiBv5OlQvNY2x6q-LJBN-VzgCErT7GTCnODqLLQ8foasqTVUCPVXvwHFniFc7CwCf0n7lmSgfrSycQNevIFSJHZ7M-S9SRQH4FMtu91qykbuvAzAOQZRw_iz_warZWFJtpIys0EVH4Gn9wYqaqLXv-5lO39fuPsqJx9z7X6luQ; BX=e7bndb9eccnhl&b=4&d=lrdDlyNpYELw7nQr45ylAA--&s=8v&i=_g5gVQJJ34nc.9WZ3JGN; GUC=AQEAAQJc0SBdu0IgxgTZ&s=AQAAAJQ2Kk5V&g=XM_RDg; __tbc=%7Bjzx%7DjGAToaZMxJYLoS7N4KRjDaxHalABoj31MSFHkZP0UNxHLBrPMu6clUAaZwsaHnUnQQaMDnEIRO1fDpAMrkMVflCNhUFWsFFB8n1hsUBhKEKL38bZEAUprS1G6wPj4GNM4bchi9l7YPvr6or9wrNMLmWzw2hPXY5j7UVUWDOUH_U; __pcvc={}; _parsely_session={%22sid%22:6%2C%22surl%22:%22https://techcrunch.com/2019/05/04/uber-is-facing-australian-class-action-suit-alleging-unlawful-conduct/%22%2C%22sref%22:%22%22%2C%22sts%22:1558071232131%2C%22slts%22:1557885616789}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:6%2C%22last_session_ts%22:1558071232131}; cmp=t=1558071232&j=0; _gid=GA1.2.1358424281.1558071235; _gat=1; __adblocker=false; xbc=%7Bjzx%7DYW6Rlvft6bPCfQyJ3DedvFReFNeSWzD34uqjUgyftdmRMMeJaQrGxlc0RnHslaNJuW923ovrMyh3fAAIY_x7R_Da15zP9YopEn3Om90NI0T5GRkVz40I1R8zV8ZQB68kBF2YuF_JsLshS1YKLFcyLSN12KbxNP4vrnBqkqtIO2yaJ5LoTRrcAPA64ePs4VtlokVTqGlotnhRSiMBSeplyP6M0a5Lj5rCIn1GIetfFxi-gIZuaMlkdAHSSmrqD1nfLBrQXcHSWrDRR0PGzzVvFjSVEXhIbldyChWDeAkkgN0hgI8KXA304yID8T-gx9UZiwWN897EFpRv3ZNtbg5IqW5GixrDYN1X7y_FdQGe5c4Tlz-figdB5Mbe5Qj2godX23QAk9Y6PbNudCC8Em1tgOzteL0CnIShQ--XvwA9qsvEZSWlAxrGfFmStXYiVaTRc1BM1DSemqPeEIoI_XtXT1h-FOYTaDZfqgflEl3Qb8MlWCowztRcnRznul-OxLIUMkPAraljlm83Bs9Z0ZZTeULOzew-rPTbrZfnXeQjr8OJtrUbNexMaJib654rgmNL7kXPxNmVdB1ZWX5IXEgmiW4XKjZACr0RxZbzXhXFfEN9gPbI7xVJJD8kfmfWoGW_0O6MebIrRbW8xxFPLY90Mw; __pvi=%7B%22id%22%3A%22v-2019-05-17-13-33-55-875-qtmTHL61BOYny2co-1510a80d282f15b71b1e5f4d8bc358ee%22%2C%22domain%22%3A%22.techcrunch.com%22%2C%22time%22%3A1558072171993%7D'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/techcrunch/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #8
File: matador_network.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         '_ga=GA1.2.1006188425.1558506407; __auc=c07ed86116ade38958b6f215c90; __gads=ID=ebab27bbe751d3d9:T=1558506409:S=ALNI_MbKANVLVZlZmub7wcHXVdRq__9uAQ; _fbp=fb.1.1558506413908.2064431331; cache-primed=1; mn-push-status=8; EU=(null); _gid=GA1.2.549309516.1558921947; __asc=2b5b28f116af808ea5c6cf504f0'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/matador_network/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #9
File: huffpost_news.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         'BX=cj1ovi5ee464n&b=3&s=nk; rxx=aflhe5fyk20.1j3ho4gz&v=1; _fbp=fb.1.1558321313127.2047077689; GUC=AQEBAQFc42Vdw0If_QRY&s=AQAAAB6nBWF3&g=XOIY0w; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _tb_sess_r=https%3A//www.huffpost.com/topic/nsfw%3Fpage%3D1; GED_PLAYLIST_ACTIVITY=W3sidSI6Im9TTWYiLCJ0c2wiOjE1NTgzNDA3MTMsIm52IjoxLCJ1cHQiOjE1NTgzNDA3MDYsImx0IjoxNTU4MzQwNzEzfSx7InUiOiIxSmhlIiwidHNsIjoxNTU4MzQwNjQ5LCJudiI6MSwidXB0IjoxNTU4MzQwNjM5LCJsdCI6MTU1ODM0MDY0OX1d; _tb_t_ppg=https%3A//www.huffpost.com/entry/nobuyoshi-araki-museum-of-sex_n_5a7c8c38e4b0c6726e10b29d'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/huffpost/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #10
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
     }
     self.cookies = {
         'cookies':
         'fly_device=desktop; fly_geo={"countryCode": "cn"}; CBS_INTERNAL=0; _cb_ls=1; _cb=DrObeWDJQRFdCPmQx1; optimizelyEndUserId=oeu1556274100628r0.4116041118910556; __gads=ID=d68306632b854d8c:T=1556274103:S=ALNI_MYpAOeaoN_TEKi9ErEphorJuu4FxA; aam_uuid=38178500434044041890375836043549172921; _v__chartbeat3=DSbaGWCHXxS0C6XCeZ; first_page_today=false; cbsnews_ad=%7B%22type%22%3A%22gpt%22%2C%22region%22%3A%22aw%22%2C%22session%22%3A%22a%22%2C%22subSession%22%3A%223%22%7D; AMCVS_10D31225525FF5790A490D4D%40AdobeOrg=1; s_cc=true; OX_plg=pm; fly_vid=1a29bea6-1a13-4100-a305-ffa9b02166d3; pmtimesig=[[1556347239934,0],[1556350240525,3000591],[1556372772902,22532377]]; s_vnum=1558866104445%26vn%3D10; s_invisit=true; s_lv_undefined_s=Less%20than%201%20day; AMCV_10D31225525FF5790A490D4D%40AdobeOrg=1406116232%7CMCMID%7C37954619966530193010387509759393309121%7CMCAAMLH-1557023341%7C11%7CMCAAMB-1557023341%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1556425741s%7CNONE%7CvVersion%7C2.5.0; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; AAMC_cbsi_0=REGION%7C11%7CAMSYNCSOP%7C%7CAMSYNCS%7C; _cb_svref=null; _t_tests=eyJMdFRUYmdVZHBDcHBKIjp7ImNob3NlblZhcmlhbnQiOiJCIiwic3BlY2lmaWNMb2NhdGlvbiI6WyJEZlhyTVYiXX0sImxpZnRfZXhwIjoibSJ9; cbsn_device=desktop; muxData=mux_viewer_id=a3de65c6-88bd-4042-a748-fb385d2ada3d&msn=0.5261598146217972&sid=11df9f3c-9e4d-47e4-9786-2de0583451e8&sst=1556418792060&sex=1556421954813; GED_PLAYLIST_ACTIVITY=W3sidSI6ImdDTUIiLCJ0c2wiOjE1NTY0MjA0NTUsIm52IjoxLCJ1cHQiOjE1NTY0MjAxNDIsImx0IjoxNTU2NDIwNDU1fV0.; s_sq=%5B%5BB%5D%5D; prevPageType=topic_list; prevPageName=cbsnews:/latest/us/5/; s_getNewRepeat=1556420875652-Repeat; s_lv_undefined=1556420875654; utag_main=v_id:016a592a36a1009f5e955a97097003079001807100bd0$_sn:10$_ss:0$_st:1556422675588$vapi_domain:cbsnews.com$dc_visit:10$_pn:38%3Bexp-session$ses_id:1556418538777%3Bexp-session$dc_event:30%3Bexp-session$dc_region:eu-central-1%3Bexp-session; _chartbeat2=.1556274100027.1556420876067.111.atSntCpXEouDM4RkLBcjI23BVm-lP.40; s_ptc=%2Flatest%2Fus%2F5%2F%5E%5E0.00%5E%5E0.01%5E%5E0.28%5E%5E0.52%5E%5E0.63%5E%5E0.44%5E%5E5.08%5E%5E0.01%5E%5E6.59; RT="sl=38&ss=1556418537489&tt=40674&obo=1&sh=1556420880100%3D38%3A1%3A40674%2C1556420718464%3D37%3A1%3A34088%2C1556420455825%3D36%3A1%3A31715%2C1556420142482%3D35%3A1%3A30988%2C1556420128526%3D34%3A1%3A30943&dm=cbsnews.com&si=91b57407-760b-481b-87e3-bcff31d166db&bcn=%2F%2F173e2514.akstat.io%2F&ld=1556420880100&r=https%3A%2F%2Fwww.cbsnews.com%2Flatest%2Fus%2F5%2F&ul=1556420983930"'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/cbs_news/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #11
File: uproxx.py Project: zhangpeng0v0/news
 def __init__(self):
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         '_ga=GA1.2.1916480018.1557387143; _omappvp=NXTUG1O09XwizTEVPHY0CDatCFaa7zmyENXMZ3yBzNBfpRrUJyJSjzawWbNOtmCk3a0M6l51v1hv01nhoAdHqIQLyxntcGlZ; __gads=ID=967d7ff68a5a2656:T=1557387149:S=ALNI_MZFY5Q_tfI8WS1_30SK817ySI14RQ; _cb_ls=1; _cb=BvCRDMN1EZ-CKUJwA; _scid=12f16568-c9b2-4331-9513-626f26e7aac6; _fbp=fb.1.1558321180103.1335358405; __qca=P0-305115545-1558321179420; _chartbeat2=.1558321174581.1558322228287.1.w0ijvCb8zgbDJfkouB1YhL9BM0Wu2.15; _sctr=1|1559059200000; _gid=GA1.2.676333276.1559707655; _cmpQcif3pcsupported=1; _parsely_visitor={%22id%22:%22f94909f1-8e1d-499d-8590-04e058a8acdf%22%2C%22session_count%22:4%2C%22last_session_ts%22:1559707963140}; _parsely_slot_click={%22url%22:%22https://uproxx.com/dimemag/demarcus-cousins-warriors-game-2-nba-finals-passing-analysis-videos/%22%2C%22x%22:1163%2C%22y%22:0%2C%22xpath%22:%22//*[@id=%5C%22menu-item-1560569%5C%22]/a[1]%22%2C%22href%22:%22https://uproxx.com/news%22}; _threds=1; _thredb=uproxx.76a113a16f1e45e5bf36b23bf05e76a6.1558321178020.1559712981550.1559713181162.30.6; _gat_auPassiveTagger=1; _gat=1'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/uproxx/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #12
 def __init__(self):
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
     }
     self.cookies = {
         'Cookie':
         'eu_cookie=1; _ga=GA1.2.716753130.1558322924; __qca=P0-1424062991-1558322923905; __gads=ID=1477ebfc328ade2f:T=1558322965:S=ALNI_MaqMSrLXw4oP8tpYvkTfPLW8rNP8g; OX_ssn=5819416341; _gid=GA1.2.90982873.1559799919; OX_plg=pm; OX_sd=3; looperSessionDepth=3; eu_cookie=1; cuid=5931835b1dcefbdb0a501558923072349_1562391999845; GED_PLAYLIST_ACTIVITY=W3sidSI6IndpbHciLCJ0c2wiOjE1NTk4MDAyOTIsIm52IjoxLCJ1cHQiOjE1NTk4MDAyOTAsImx0IjoxNTU5ODAwMjkxfSx7InUiOiJVYmRFIiwidHNsIjoxNTU5ODAwMDY3LCJudiI6MSwidXB0IjoxNTU5ODAwMDM0LCJsdCI6MTU1OTgwMDA2NX0seyJ1IjoieHl4NiIsInRzbCI6MTU1OTgwMDA1NiwibnYiOjEsInVwdCI6MTU1OTgwMDAzNCwibHQiOjE1NTk4MDAwNTZ9XQ..; _gat=1'
     }
     self.key_word = ['news', 'features', 'movies', 'television', 'comics']
     self.downloadPath = '/data/crawler'
     self.picPath = '/looper/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #13
 def __init__(self):
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
     }
     self.cookies = {
         'cookies':
         'odin_tt=25f29c3c11ab624e32ea123b341f8e8ad3b9254cb1bcb00828ea8bbdf642ee3018a6a10f8ce2d4c3bb22af93a7fbcf4f44f76469931ce1241c8907041d196a1c; tt_webid=6675470162378032646; __tea_sdk__user_unique_id=6675470162378032646; __tea_sdk__ssid=f4cef532-3e68-4425-a4fa-8963bda2fdc3; csrf-token=da1ad8433b7acb6730721e47b072bc7ec710c4e3; csrf-secret=QBi0atkMP4iR2oosQVsHoAxAo7LA2Qzm'
     }
     self.keyword = [
         'foryou', 'entertainment', 'sports', 'lifestyle', 'gaming', 'food',
         'tech', 'autos'
     ]
     self.t = time.time()
     self.downloadPath = '/data/crawler'
     self.picPath = '/topbuzz/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #14
File: bbc_news.py Project: zhangpeng0v0/news
 def __init__(self):
     self.news_api = NewsApiClient(
         api_key='cb7a4ae15a98429890aeedb9a7b460a0')
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         'ckns_orb_fig_cache={%22ad%22:1%2C%22ap%22:4%2C%22ck%22:0%2C%22eu%22:0%2C%22uk%22:0}; ckns_sa_labels_persist={}; ckns_sscid=7f5aa895-8a47-4928-8632-ae8118032bcb; _cb_ls=1; _cb=CF1-z-CgoNtbBVcmNK; ckns_eds=INS-vt29-666188954:923108334-1556503556; ckns_settings-nonce=FwTiPYjnehKUtBu4zb7oIJ4j; amlbcookie=01; ckns_mvt=8c10379c-2f9b-44e6-a88b-97b0315adccb; ckns_account_experiments=j%3A%7B%22accxp_marketing_opt_in_2%22%3A%22control%22%7D; AWSELB=0FC55D47187ECE9190E70C0A017AC69A844CA844E9727B10D1C45E9E505E11A5757E62A62559CE5ECC76BE5C0D98ACC5FFDFADB0DF8505DDE5C427CC6C744FDB90DA13BB15F2555DE48D9361FEFE0FBEA45595E8C7; ckns_stateless=1; ckns_nonce=nzFw9J2FPDEn17WnPYS0LnRO; ckns_id=eyJhYiI6Im8xOCIsImVwIjp0cnVlLCJldiI6ZmFsc2UsInBzIjoicHVmZjhMV3pjSUlfckQ3RlkwaVo1V0dsM3czbFdBWDQ0TmVXNktKYjdDMCIsInNlcy1leHAiOjE1NTY1MTgzOTIwMDAsImp3dC1leHAiOjE2MTk1ODk0OTIwMDAsInRrbi1leHAiOjE1NTY1MjExMzEwMDAsInJ0a24tZXhwIjoxNjE5NTg5NDkyMDAwfQ; ckns_atkn=eyJ0eXAiOiJKV1QiLCJ6aXAiOiJOT05FIiwiYWxnIjoiSFMyNTYifQ.eyJzdWIiOiIzY2RiOWVkOC01ZjdmLTRlZWEtODYxNS1jMzZmMTdhZjZkMzEiLCJjdHMiOiJPQVVUSDJfU1RBVEVMRVNTX0dSQU5UIiwiYXV0aF9sZXZlbCI6MiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MCIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiYWNjZXNzX3Rva2VuIiwidG9rZW5fdHlwZSI6IkJlYXJlciIsImF1dGhHcmFudElkIjoidlhTaHVESDJRc3BOTTItZ0d3ek4yYlBJczRRIiwiYXVkIjoiQWNjb3VudCIsIm5iZiI6MTU1NjUxNzUzMSwiZ3JhbnRfdHlwZSI6InJlZnJlc2hfdG9rZW4iLCJzY29wZSI6WyJleHBsaWNpdCIsImltcGxpY2l0IiwicGlpIiwidWlkIiwib3BlbmlkIl0sImF1dGhfdGltZSI6MTU1NjUxNzQ5MSwicmVhbG0iOiIvIiwiZXhwIjoxNTU2NTI0NzMxLCJpYXQiOjE1NTY1MTc1MzEsImV4cGlyZXNfaW4iOjcyMDAsImp0aSI6IkZXcExhak13bmYxdUJyRWMtY0xaNnlpTUE1cyJ9.OhaC7wNmB_bALESjcJH8JjKcGRa-WaGkcZGWS0rhAtg; ckns_idtkn=eyJ0eXAiOiJKV1QiLCJraWQiOiJIa2d0WDBJd3RDOStSVGQvOWdYdFN0bk9VaU09IiwiYWxnIjoiUlMyNTYifQ.eyJhdF9oYXNoIjoiNGFfU2tJMWtQaVVZbks2VGlNSm9BdyIsInN1YiI6IjNjZGI5ZWQ4LTVmN2YtNGVlYS04NjE1LWMzNmYxN2FmNmQzMSIsImFiIjoibzE4IiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MSIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiaWRfdG9rZW4iLCJhdWQiOiJBY2NvdW50IiwiYWNyIjoiMCIsImF6cCI6IkFjY291bnQiLCJhdXRoX3RpbWUiOjE1NTY1MTc0OTEsInJlYWxtIjoiLyIsImV4cCI6MTU1NjUyMTEzMSwidG9rZW5UeXBlIjoiSldUVG9rZW4iLCJpYXQiOjE1NTY1MTc1MzF9.LqYjXmcfMMVfB3UupPV8oqez0gojKu-9anW-73WVKXOS5deEbwwYMrTr8JQy85WhwzlZNA5e8eqLPWJ_lAgfjCiw60zdYMxM_x_ZYaHtpPtAXf0SCOD8FlTBKnRZYDqKNkj8F22ctDqPUqrRrN-tDVTqrVMYW38sHqBeXalUGkw-2C24UBlE4DFcDqeqjn0pOFbwuFyQpgwrwp1y6UyUvF3WhuB6GVIkkKNUgYbWpnHTmP9OD8DNM_MH9TLDaC9SoRE5py51CpkZ78Y4rnQUAHeHibjbOwLKQkadVGhFzxr4vzxwJlRj_nrCySmrplgDJ7a9P_raVKfL4JH6UeA1_A; atuserid=%7B%22name%22%3A%22atuserid%22%2C%22val%22%3A%222a12b799-e590-476f-9b23-800e48e162f4%22%2C%22options%22%3A%7B%22end%22%3A%222020-05-30T05%3A59%3A40.931Z%22%2C%22path%22%3A%22%2F%22%7D%7D; _chartbeat2=.1556007561538.1556517584378.1000001.BzGCuqD5hQB-BMGlTiNhxZyCFMP2O.1; _cb_svref=https%3A%2F%2Fwww.bbc.com%2Fnews; ckps_id_ptrt=https%3A%2F%2Fwww.bbc.co.uk%2Fprogrammes%2Fw172wy08d8yw9mq; ecos.dt=1556517629804'
     }
     self.t = time.time()
     self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
     self.keyword = [
         'News', 'Health', 'Science', 'Entertainment', 'Technology'
     ]
     self.downloadPath = '/data/crawler'
     self.picPath = '/bbc_news/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #15
 def __init__(self):
     self.news_api = NewsApiClient(
         api_key='e7d5104fc5c74e259dbe2427b68257fb')
     self.key_word = [
         'U.S.', 'Lifestyle', 'Technology', 'Entertainment', 'Sports',
         'Health'
     ]
     self.t = time.time()
     self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
     self.headers = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
     }
     self.cookies = {
         'cookie':
         'cookieMonster=1; _cb_ls=1; SWID=522fc1e1-4ffd-4802-86fa-d7475f8dca57; optimizelyEndUserId=oeu1557122779744r0.05588715173473946; s_vi=[CS]v1|2E67E7728507B2CE-4000011580009D3A[CE]; __gads=ID=b52dc242d5ad893e:T=1557122797:S=ALNI_MaWnnuLLKP88qrPEsPEVuViaoGlJg; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; _cb=Dz5K21B0CX5JDkjMG; _v__chartbeat3=DbUUrKDDGTaEDqauaQ; _cb_svref=null; AkamaiAnalytics_BrowserSessionId=4d41ba72-5ab2-8f33-46aa-f5748aca9647; HTML_VisitIntervalStartTime=1557125661015; s_sess=%20s_cc%3Dtrue%3B%20s_sq%3D%3B; adnum=3undefined; _chartbeat2=.1557122809880.1557125676423.1.DoKBmGC2OW5QDWek5PEERi8oZtYZ.12; HTML_BitRateBucketCsv=0,19083,16715,0,0,0,0,0; HTML_VisitValueCookie=1|1|1|0|35798|35826|0|0|0|0|0|0|NaN; s_pers=%20s_fid%3D22C0AF24132A0778-001FDBB2DA7591AD%7C1620284117936%3B%20s_c20%3D1557125717941%7C1651733717941%3B%20s_c20_s%3DFirst%2520Visit%7C1557127517941%3B; HTML_isPlayingCount=2; GED_PLAYLIST_ACTIVITY=W3sidSI6IlhYU0wiLCJ0c2wiOjE1NTcxMjU3MzgsIm52IjowLCJ1cHQiOjE1NTcxMjU1NDQsImx0IjoxNTU3MTI1NjM3fSx7InUiOiIzbnlXIiwidHNsIjoxNTU3MTI1NzM3LCJudiI6MCwidXB0IjoxNTU3MTI1NTU2LCJsdCI6MTU1NzEyNTYyM30seyJ1IjoiWG81TSIsInRzbCI6MTU1NzEyNTczNywibnYiOjEsInVwdCI6MTU1NzEyNTU4NSwibHQiOjE1NTcxMjU3MzV9XQ..; HTML_VisitCountCookie=1'
     }
     self.downloadPath = '/data/crawler'
     self.picPath = '/abc_news/picture/'
     self.filter = Filter_Data()
     self.save = Save_Data()
Code Example #16
class Looper_News(object):
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        self.cookies = {
            'Cookie':
            'eu_cookie=1; _ga=GA1.2.716753130.1558322924; __qca=P0-1424062991-1558322923905; __gads=ID=1477ebfc328ade2f:T=1558322965:S=ALNI_MaqMSrLXw4oP8tpYvkTfPLW8rNP8g; OX_ssn=5819416341; _gid=GA1.2.90982873.1559799919; OX_plg=pm; OX_sd=3; looperSessionDepth=3; eu_cookie=1; cuid=5931835b1dcefbdb0a501558923072349_1562391999845; GED_PLAYLIST_ACTIVITY=W3sidSI6IndpbHciLCJ0c2wiOjE1NTk4MDAyOTIsIm52IjoxLCJ1cHQiOjE1NTk4MDAyOTAsImx0IjoxNTU5ODAwMjkxfSx7InUiOiJVYmRFIiwidHNsIjoxNTU5ODAwMDY3LCJudiI6MSwidXB0IjoxNTU5ODAwMDM0LCJsdCI6MTU1OTgwMDA2NX0seyJ1IjoieHl4NiIsInRzbCI6MTU1OTgwMDA1NiwibnYiOjEsInVwdCI6MTU1OTgwMDAzNCwibHQiOjE1NTk4MDAwNTZ9XQ..; _gat=1'
        }
        self.key_word = ['news', 'features', 'movies', 'television', 'comics']
        self.downloadPath = '/data/crawler'
        self.picPath = '/looper/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        pg = 12
        while pg < 24:
            for kw in self.key_word:
                url = 'https://www.looper.com/category/{}/?ajax=1&offset={}'.format(
                    kw, pg)
                self.parsing_news_list_page(url=url)
            pg += 12

    def parsing_news_list_page(self, url):
        res = requests.get(url=url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(res)
        url_list = html.xpath('//h3/a/@href')
        for i in url_list:
            status = self.filter.filter_data(details_url=i)
            if status:
                pass
            else:
                self.parsing_details_page(details_url=i)

    def parsing_details_page(self, details_url):
        res = requests.get(url=details_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(res)
        source = int(13)
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(html.xpath('//h1[@class="title-gallery"]/text()'))
        authorName = ''.join(
            html.xpath('//div[@class="gallery-info"]/a/text()'))
        releaseTime = ''.join(
            html.xpath('//span[@class="news-timestamp"]/text()'))
        content = self.analysis_news_content(html=res,
                                             html_obj=html,
                                             newspaper=False)
        img = self.analysis_news_img(html_obj=html)
        if img is None or img == '' or content is None or content == '':
            pass
        else:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='looper')

    def analysis_news_content(self, html, html_obj, newspaper=False):
        if newspaper:
            text = fulltext(html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath('//div[@id="content"]//p//text()')
            content = '<p>'.join([
                i.replace("\n", '').strip() for i in content_list
            ]).replace("<p><p>", '<p>')
        return content

    def analysis_news_img(self, html_obj):
        pic_url_list = html_obj.xpath('//div[@id="content"]//img/@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url, r'%s.jpg' % (self.downloadPath + self.picPath +
                                          str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' %
                    (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img
        except:
            return None
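
These excerpts are shown without their import section. As a rough sketch, the Looper_News crawler above appears to depend on the modules below; the import paths for the project-local Filter_Data and Save_Data helpers are assumptions, since they are not visible in the excerpt:

# Sketch of the imports the crawler classes in this project appear to rely on.
# The paths for Filter_Data and Save_Data are guesses -- they are project-local
# helpers whose modules are not shown in these excerpts.
import random
import time
import urllib.request
import uuid

import requests
from lxml import etree
from newspaper import fulltext        # used when the newspaper=True branch runs

# from filter_data import Filter_Data   # assumed project-local module
# from save_data import Save_Data       # assumed project-local module

if __name__ == '__main__':
    crawler = Looper_News()
    crawler.run()   # fetches one AJAX page (offset=12) per category in key_word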
Code Example #17
File: huffpost_news.py Project: zhangpeng0v0/news
class HuffPost_News():
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            'BX=cj1ovi5ee464n&b=3&s=nk; rxx=aflhe5fyk20.1j3ho4gz&v=1; _fbp=fb.1.1558321313127.2047077689; GUC=AQEBAQFc42Vdw0If_QRY&s=AQAAAB6nBWF3&g=XOIY0w; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _tb_sess_r=https%3A//www.huffpost.com/topic/nsfw%3Fpage%3D1; GED_PLAYLIST_ACTIVITY=W3sidSI6Im9TTWYiLCJ0c2wiOjE1NTgzNDA3MTMsIm52IjoxLCJ1cHQiOjE1NTgzNDA3MDYsImx0IjoxNTU4MzQwNzEzfSx7InUiOiIxSmhlIiwidHNsIjoxNTU4MzQwNjQ5LCJudiI6MSwidXB0IjoxNTU4MzQwNjM5LCJsdCI6MTU1ODM0MDY0OX1d; _tb_t_ppg=https%3A//www.huffpost.com/entry/nobuyoshi-araki-museum-of-sex_n_5a7c8c38e4b0c6726e10b29d'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/huffpost/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        pg = 1
        while pg < 11:
            start_url = 'https://www.huffpost.com/topic/nsfw?page={}'.format(
                pg)
            self.parsing_huffpost_news_list(list_url=start_url)
            pg += 1

    def parsing_huffpost_news_list(self, list_url):
        res = requests.get(url=list_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        news_list_url = html.xpath('//div[@class="card__content"]/a/@href')
        for details_url in news_list_url:
            try:
                self.parsing_details_page_url(details_url=details_url)
            except:
                pass

    def parsing_details_page_url(self, details_url):
        status = self.filter.filter_data(details_url=details_url)
        if status:
            print('Data already exists!')
        else:
            res = requests.get(url=details_url,
                               headers=self.headers,
                               cookies=self.cookies).text
            time.sleep(random.uniform(1, 3))
            html = etree.HTML(res)
            source = int(12)
            jobId = time.time()
            sourceUrl = details_url
            title = ''.join(
                html.xpath('//h1[@class="headline__title"]//text()'))
            authorName = self.analysis_author_name(html=html)
            releaseTime = self.analysis_release_time(html=html).replace(
                "\n", '').strip()
            content = self.analysis_new_content(res=res,
                                                html=html,
                                                newspaper=False)
            img_list = self.analysis_download_img(html=html)
            img = self.download_pic(img_url_list=img_list)
            if img == '' or img is None or content == '' or content is None:
                pass
            else:
                data = {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }
                print('data:\n', data)
                self.save.save_data(data=data, news='huffpost')

    def analysis_author_name(self, html):
        authorName = ''.join(
            html.xpath('//div[@class="author-list"]/span/text()'))
        if authorName == '' or authorName is None:
            authorName = ''.join(
                html.xpath('//div[@class="author-card__name"]//text()'))
            return authorName
        else:
            return authorName

    def analysis_release_time(self, html):
        releaseTime_1 = ''.join(
            html.xpath(
                '//div[@class="timestamp timestamp--has-modified-date"]//text()'
            ))
        if releaseTime_1 == '' or releaseTime_1 is None:
            releaseTime_2 = ''.join(
                html.xpath('//div[@class="timestamp"]//text()'))
            if releaseTime_2 == '' or releaseTime_2 is None:
                releaseTime_3 = ''.join(
                    html.xpath(
                        '//div[@class="timestamp timestamp--contributor timestamp--has-modified-date"]//text()'
                    ))
                return releaseTime_3
            else:
                return releaseTime_2
        else:
            return releaseTime_1

    def analysis_new_content(self, res, html, newspaper=False):
        if newspaper:
            text = fulltext(res).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
            return content
        else:
            text = html.xpath(
                '//div[@class="content-list-component yr-content-list-text text"]//p//text()'
            )
            content = '<p>'.join([i.replace("\n", '').strip() for i in text
                                  ]).replace("<p><p>",
                                             '<p>').replace("<p>,<p>", ' ')
            return content

    def download_pic(self, img_url_list):
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        pic_url_list_1 = [i for i in img_url_list if '.svg' not in i]
        pic_url_list = [j for j in pic_url_list_1 if 'ops=100_100' not in j]
        miss_pic = 'https://img.huffingtonpost.com/asset/default-missing-image.jpg?cache=jio2vozgty&ops=scalefit_970_noupscale'
        if pic_url_list == [] or miss_pic in pic_url_list:
            return None
        else:
            if len(pic_url_list) < 18:
                pic_list = pic_url_list
            else:
                pic_list = pic_url_list[:17]
            for pic_url in pic_list:
                urllib.request.urlretrieve(
                    pic_url, r'%s.jpg' % (self.downloadPath + self.picPath +
                                          str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' %
                    (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img

    def analysis_download_img(self, html):
        pic_url_list1 = html.xpath(
            '//div[@class="listicle__slide-content"]/img/@src')
        if pic_url_list1 == []:
            pic_url_list2 = html.xpath(
                '//div[@class="entry__body js-entry-body"]//img/@src')
            if pic_url_list2 == []:
                pic_url_list3 = html.xpath(
                    '//div[@class="collection-item image"]//img/@src')
                return pic_url_list3
            else:
                return pic_url_list2
        else:
            return pic_url_list1
Code Example #18
class TopBuzz_News(object):
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        self.cookies = {
            'cookies':
            'odin_tt=25f29c3c11ab624e32ea123b341f8e8ad3b9254cb1bcb00828ea8bbdf642ee3018a6a10f8ce2d4c3bb22af93a7fbcf4f44f76469931ce1241c8907041d196a1c; tt_webid=6675470162378032646; __tea_sdk__user_unique_id=6675470162378032646; __tea_sdk__ssid=f4cef532-3e68-4425-a4fa-8963bda2fdc3; csrf-token=da1ad8433b7acb6730721e47b072bc7ec710c4e3; csrf-secret=QBi0atkMP4iR2oosQVsHoAxAo7LA2Qzm'
        }
        self.keyword = [
            'foryou', 'entertainment', 'sports', 'lifestyle', 'gaming', 'food',
            'tech', 'autos'
        ]
        self.t = time.time()
        self.downloadPath = '/data/crawler'
        self.picPath = '/topbuzz/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        for cls in self.keyword:
            print('cls:\t', cls)
            url = 'https://www.topbuzz.com/pgc/feed?content_space=bd&language=en&region=us&user_id=6675470162378032646' \
                  '&channel_name=' + cls + \
                  '&classification=all' \
                  '&max_behot_time=' + str(self.t)
            self.parsing_topBuzz_list_page(list_url=url)

    def parsing_topBuzz_list_page(self, list_url):
        res = requests.get(url=list_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        data = json.loads(res)
        item = data['data']['feed']['items']
        for i in range(len(item)):
            group_id = item[i]['group_id']
            impr_id = item[i]['impr_id']
            user_id = item[i]['author_info']['user_id']
            detail_url = 'https://www.topbuzz.com/a/' \
                         + group_id + \
                         '?app_id=1106' \
                         '&gid=' + group_id + \
                         '&impr_id=' + impr_id + \
                         '&language=en' \
                         '&region=us' \
                         '&user_id=' + user_id + \
                         '&c=sys'
            status = self.filter.filter_data(details_url=detail_url)
            if status:
                print('Data already exists!')
            else:
                self.parsing_details_page(details_url=detail_url)

    def parsing_details_page(self, details_url):
        time.sleep(random.uniform(1, 3))
        res = requests.get(url=details_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        source = int(1)
        jobId = time.time()
        sourceUrl = details_url
        title = ''.join(html.xpath('//div[@class="title"]/text()'))
        authorName = ''.join(html.xpath('//div[@class="name active"]/text()'))
        releaseTime = ''.join(html.xpath('//div[@class="publishTime"]/text()'))
        content = self.parsing_news_content(res=res, html=html, newspaper=True)
        img = self.download_img(html=html)
        if img is None or img == '' or content is None or content == '':
            pass
        else:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='topBuzz')

    def parsing_news_content(self, res, html, newspaper=False):
        try:
            if newspaper:
                text = fulltext(res).split('\n')
                txt = list(filter(lambda x: x.strip() != '', text))
                content = '<p>'.join(txt)
                return content
            else:
                text = html.xpath(
                    '//div[@class="editor-container"]//p//text()')
                content = '<p>'.join([
                    i.replace("\n", '').strip() for i in text
                ]).replace("<p><p>", '<p>')
                return content
        except:
            return None

    def download_img(self, html):
        pic_url_list = html.xpath('//main//img//@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_url_list == []:
            pass
        else:
            try:
                pic_list = [i for i in pic_url_list if 'https' not in i]
                for pic_url in pic_list[:17]:
                    urllib.request.urlretrieve(
                        'https:' + pic_url,
                        r'%s.jpg' % (self.downloadPath + self.picPath +
                                     str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' %
                        (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
            except:
                return None
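
For reference, parsing_topBuzz_list_page above only touches a handful of fields in the feed response. A minimal sketch of the JSON shape it expects, with made-up values standing in for a real TopBuzz payload:

# Illustrative payload only; field values are invented and the real feed
# carries many more keys than the crawler reads.
sample_feed = {
    'data': {
        'feed': {
            'items': [
                {
                    'group_id': '1234567890',
                    'impr_id': '0987654321',
                    'author_info': {'user_id': '42'},
                },
            ],
        },
    },
}

# The same traversal the crawler performs before building each detail_url.
for item in sample_feed['data']['feed']['items']:
    print(item['group_id'], item['impr_id'], item['author_info']['user_id'])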
Code Example #19
File: ap_news.py Project: zhangpeng0v0/news
class Associated_Press_News(object):
    def __init__(self):
        self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        self.cookies = {'cookie': '_cb_ls=1; _cb=ChGdwsejPcBwqK1A; _ga=GA1.2.1067424464.1556266698; __gads=ID=b2804ef9280ce726:T=1556266708:S=ALNI_MbsZp6KMsLTd9MAhzM98UpWqF4sEQ; __qca=P0-112096547-1556266838413; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; GED_PLAYLIST_ACTIVITY=W3sidSI6Ilp4Q0YiLCJ0c2wiOjE1NTY2MTc5NjcsIm52IjowLCJ1cHQiOjE1NTY2MTc5NjAsImx0IjoxNTU2NjE3OTYwfV0.; _gid=GA1.2.1304411157.1557027854; _cb_svref=null; OptanonConsent=landingPath=NotLandingPage&datestamp=Sun+May+05+2019+11%3A44%3A56+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=4.1.0&EU=false&groups=0_140011%3A1%2C1%3A1%2C0_140010%3A1%2C2%3A1%2C3%3A1%2C4%3A1%2C0_140046%3A1%2C0_140042%3A1%2C0_140038%3A1%2C0_140034%3A1%2C0_140055%3A1%2C0_140051%3A1%2C0_140047%3A1%2C0_140043%3A1%2C0_140039%3A1%2C0_140035%3A1%2C0_140031%3A1%2C0_140052%3A1%2C0_140048%3A1%2C0_140044%3A1%2C0_140040%3A1%2C0_140036%3A1%2C0_140032%3A1%2C0_140053%3A1%2C0_140049%3A1%2C0_140045%3A1%2C0_140041%3A1%2C0_140037%3A1%2C0_140033%3A1%2C0_140054%3A1%2C0_140050%3A1%2C101%3A1%2C102%3A1%2C103%3A1%2C104%3A1%2C105%3A1%2C106%3A1%2C107%3A1%2C108%3A1%2C109%3A1%2C110%3A1%2C111%3A1%2C112%3A1%2C113%3A1%2C114%3A1%2C115%3A1%2C116%3A1%2C117%3A1%2C118%3A1%2C119%3A1%2C120%3A1%2C121%3A1%2C122%3A1%2C123%3A1%2C124%3A1%2C125%3A1%2C126%3A1%2C127%3A1%2C128%3A1%2C129%3A1%2C130%3A1%2C131%3A1%2C132%3A1%2C133%3A1%2C134%3A1%2C135%3A1%2C136%3A1%2C137%3A1%2C138%3A1%2C139%3A1%2C140%3A1%2C141%3A1%2C142%3A1%2C143%3A1%2C144%3A1%2C145%3A1%2C146%3A1%2C147%3A1%2C148%3A1%2C149%3A1%2C150%3A1%2C151%3A1%2C152%3A1%2C153%3A1%2C154%3A1%2C155%3A1&AwaitingReconsent=false; _tb_sess_r=; _tb_t_ppg=https%3A//apnews.com/245117b7dafd4790ba3d51db06cf345a; _gat=1; _chartbeat2=.1556266696382.1557028669628.1111100001.Vfd8vwJvnJujXq7Dq7JmkgXZfl.4'}
        self.downloadPath = '/data/crawler'
        self.picPath = '/ap_news/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()


    def run(self):
        news_dic = {
            'top' : 'https://apnews.com/apf-topnews',
            'sport' : 'https://apnews.com/apf-sports',
            'entertainment' : 'https://apnews.com/apf-entertainment',
            'travel' : 'https://apnews.com/apf-Travel',
            'technology' : 'https://apnews.com/apf-technology',
            'lifestyle' : 'https://apnews.com/apf-lifestyle',
            'business' : 'https://apnews.com/apf-business',
            'usNews' : 'https://apnews.com/apf-usnews',
            'health' : 'https://apnews.com/apf-Health',
            'science' : 'https://apnews.com/apf-science',
            'intlNews' : 'https://apnews.com/apf-intlnews',
            'politics' : 'https://apnews.com/apf-politics',
        }
        for url in news_dic:
            print('newsUrl:\n', url)
            try:
                self.parsing_news_list_page(news_start_url=news_dic[url])
            except:
                pass


    def parsing_news_list_page(self, news_start_url):
        list_page_html = requests.get(url=news_start_url, headers=self.headers, cookies=self.cookies).text
        time.sleep(random.uniform(2, 5))
        list_html_obj = etree.HTML(list_page_html)
        list_page_url = list_html_obj.xpath('//a[@class="headline"]/@href')
        list_url = ['https://apnews.com' + i for i in list_page_url if 'https://apnews.com' not in i]
        for details_url in list_url:
            result=self.filter.filter_data(details_url=details_url)
            if result:
                print('Data already exists!')
            else:
                self.parsing_details_page(details_url=details_url)


    def parsing_details_page(self, details_url):
        details_html = requests.get(url=details_url, headers = self.headers, cookies = self.cookies).text
        time.sleep(random.uniform(1, 3))
        html_obj = etree.HTML(details_html)
        source = int(4)
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(html_obj.xpath('//div[@class="headline"]//h1/text()'))
        authorName = ''.join(html_obj.xpath('//span[@class="byline"]/text()'))
        releaseTime = ''.join(html_obj.xpath('//span[@class="Timestamp"]/@data-source'))
        content = self.parsing_news_content(content_html =details_html)
        img_urls = html_obj.xpath('//a[@class="LeadFeature LeadFeature_gallery"]/@href')
        if img_urls == [] or img_urls is None:
            pass
        else:
            img = self.download_picture(html=details_html)
            if img is None or img == '':
                pass
            else:
                data = {'source': source, 'jobId': int(jobId), 'sourceUrl': sourceUrl, 'title': title, 'authorName': authorName,
                        'releaseTime': releaseTime, 'content': content, 'img': img}
                print('data:\n', data)
                self.save.save_data(data=data, news='ap')


    def parsing_news_content(self, content_html):
        text = fulltext(content_html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content


    def download_picture(self, html):
        try:
            url_list = self.analysis_pic_url(html=html)
            img_id = str(uuid.uuid4()).replace('-','')
            index = 1
            img_list = []
            for pic_url in url_list[:17]:
                urllib.request.urlretrieve(pic_url, r'%s.jpg' % (self.downloadPath + self.picPath + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img
        except:
            pass


    def analysis_pic_url(self,  html):
        html_script = r'<script>(.*?)</script>'
        script = re.findall(html_script, html, re.S | re.M)
        mediumIds_rule = r'mediumIds(.*?)]'
        rule = re.compile(mediumIds_rule)
        result = rule.findall(script[0])[0][3:]
        result = "[" + result + "]"
        js = json.loads(result)
        url_list = []
        for i in js:
            url = 'https://storage.googleapis.com/afs-prod/media/' + i + '/' + '600.jpeg'
            url_list.append(url)
        return url_list
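
analysis_pic_url above recovers the gallery image ids by slicing a mediumIds field out of the page's first inline <script> block. A small standalone sketch of that extraction against a made-up script body (the real AP pages embed a much larger state object):

import json
import re

# Hypothetical page fragment for illustration only.
html = '<script>var state = {"mediumIds":["abc123","def456"],"other":1};</script>'

script = re.findall(r'<script>(.*?)</script>', html, re.S | re.M)
raw = re.compile(r'mediumIds(.*?)]').findall(script[0])[0][3:]   # drop the leading '":['
medium_ids = json.loads('[' + raw + ']')
urls = ['https://storage.googleapis.com/afs-prod/media/' + i + '/600.jpeg'
        for i in medium_ids]
print(urls)   # -> [.../media/abc123/600.jpeg, .../media/def456/600.jpeg]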
Code Example #20
File: bbc_news.py Project: zhangpeng0v0/news
class BBC_News():
    def __init__(self):
        self.news_api = NewsApiClient(
            api_key='cb7a4ae15a98429890aeedb9a7b460a0')
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            'ckns_orb_fig_cache={%22ad%22:1%2C%22ap%22:4%2C%22ck%22:0%2C%22eu%22:0%2C%22uk%22:0}; ckns_sa_labels_persist={}; ckns_sscid=7f5aa895-8a47-4928-8632-ae8118032bcb; _cb_ls=1; _cb=CF1-z-CgoNtbBVcmNK; ckns_eds=INS-vt29-666188954:923108334-1556503556; ckns_settings-nonce=FwTiPYjnehKUtBu4zb7oIJ4j; amlbcookie=01; ckns_mvt=8c10379c-2f9b-44e6-a88b-97b0315adccb; ckns_account_experiments=j%3A%7B%22accxp_marketing_opt_in_2%22%3A%22control%22%7D; AWSELB=0FC55D47187ECE9190E70C0A017AC69A844CA844E9727B10D1C45E9E505E11A5757E62A62559CE5ECC76BE5C0D98ACC5FFDFADB0DF8505DDE5C427CC6C744FDB90DA13BB15F2555DE48D9361FEFE0FBEA45595E8C7; ckns_stateless=1; ckns_nonce=nzFw9J2FPDEn17WnPYS0LnRO; ckns_id=eyJhYiI6Im8xOCIsImVwIjp0cnVlLCJldiI6ZmFsc2UsInBzIjoicHVmZjhMV3pjSUlfckQ3RlkwaVo1V0dsM3czbFdBWDQ0TmVXNktKYjdDMCIsInNlcy1leHAiOjE1NTY1MTgzOTIwMDAsImp3dC1leHAiOjE2MTk1ODk0OTIwMDAsInRrbi1leHAiOjE1NTY1MjExMzEwMDAsInJ0a24tZXhwIjoxNjE5NTg5NDkyMDAwfQ; ckns_atkn=eyJ0eXAiOiJKV1QiLCJ6aXAiOiJOT05FIiwiYWxnIjoiSFMyNTYifQ.eyJzdWIiOiIzY2RiOWVkOC01ZjdmLTRlZWEtODYxNS1jMzZmMTdhZjZkMzEiLCJjdHMiOiJPQVVUSDJfU1RBVEVMRVNTX0dSQU5UIiwiYXV0aF9sZXZlbCI6MiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MCIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiYWNjZXNzX3Rva2VuIiwidG9rZW5fdHlwZSI6IkJlYXJlciIsImF1dGhHcmFudElkIjoidlhTaHVESDJRc3BOTTItZ0d3ek4yYlBJczRRIiwiYXVkIjoiQWNjb3VudCIsIm5iZiI6MTU1NjUxNzUzMSwiZ3JhbnRfdHlwZSI6InJlZnJlc2hfdG9rZW4iLCJzY29wZSI6WyJleHBsaWNpdCIsImltcGxpY2l0IiwicGlpIiwidWlkIiwib3BlbmlkIl0sImF1dGhfdGltZSI6MTU1NjUxNzQ5MSwicmVhbG0iOiIvIiwiZXhwIjoxNTU2NTI0NzMxLCJpYXQiOjE1NTY1MTc1MzEsImV4cGlyZXNfaW4iOjcyMDAsImp0aSI6IkZXcExhak13bmYxdUJyRWMtY0xaNnlpTUE1cyJ9.OhaC7wNmB_bALESjcJH8JjKcGRa-WaGkcZGWS0rhAtg; ckns_idtkn=eyJ0eXAiOiJKV1QiLCJraWQiOiJIa2d0WDBJd3RDOStSVGQvOWdYdFN0bk9VaU09IiwiYWxnIjoiUlMyNTYifQ.eyJhdF9oYXNoIjoiNGFfU2tJMWtQaVVZbks2VGlNSm9BdyIsInN1YiI6IjNjZGI5ZWQ4LTVmN2YtNGVlYS04NjE1LWMzNmYxN2FmNmQzMSIsImFiIjoibzE4IiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MSIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiaWRfdG9rZW4iLCJhdWQiOiJBY2NvdW50IiwiYWNyIjoiMCIsImF6cCI6IkFjY291bnQiLCJhdXRoX3RpbWUiOjE1NTY1MTc0OTEsInJlYWxtIjoiLyIsImV4cCI6MTU1NjUyMTEzMSwidG9rZW5UeXBlIjoiSldUVG9rZW4iLCJpYXQiOjE1NTY1MTc1MzF9.LqYjXmcfMMVfB3UupPV8oqez0gojKu-9anW-73WVKXOS5deEbwwYMrTr8JQy85WhwzlZNA5e8eqLPWJ_lAgfjCiw60zdYMxM_x_ZYaHtpPtAXf0SCOD8FlTBKnRZYDqKNkj8F22ctDqPUqrRrN-tDVTqrVMYW38sHqBeXalUGkw-2C24UBlE4DFcDqeqjn0pOFbwuFyQpgwrwp1y6UyUvF3WhuB6GVIkkKNUgYbWpnHTmP9OD8DNM_MH9TLDaC9SoRE5py51CpkZ78Y4rnQUAHeHibjbOwLKQkadVGhFzxr4vzxwJlRj_nrCySmrplgDJ7a9P_raVKfL4JH6UeA1_A; atuserid=%7B%22name%22%3A%22atuserid%22%2C%22val%22%3A%222a12b799-e590-476f-9b23-800e48e162f4%22%2C%22options%22%3A%7B%22end%22%3A%222020-05-30T05%3A59%3A40.931Z%22%2C%22path%22%3A%22%2F%22%7D%7D; _chartbeat2=.1556007561538.1556517584378.1000001.BzGCuqD5hQB-BMGlTiNhxZyCFMP2O.1; _cb_svref=https%3A%2F%2Fwww.bbc.com%2Fnews; ckps_id_ptrt=https%3A%2F%2Fwww.bbc.co.uk%2Fprogrammes%2Fw172wy08d8yw9mq; ecos.dt=1556517629804'
        }
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.keyword = [
            'News', 'Health', 'Science', 'Entertainment', 'Technology'
        ]
        self.downloadPath = '/data/crawler'
        self.picPath = '/bbc_news/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        self.parsing_bbc_news_list()

    def parsing_bbc_news_list(self):
        today = self.point_time
        for kw in self.keyword:
            print('keyword:\n', kw)
            news_list = self.news_api.get_everything(q=kw,
                                                     sources='bbc-news',
                                                     domains='bbc.co.uk',
                                                     from_param=today,
                                                     to=today[:-1] +
                                                     str(int(today[-1]) - 1),
                                                     language='en',
                                                     sort_by='relevancy',
                                                     page_size=100)
            self.parsing_news_list_url(news_list=news_list)

    def parsing_news_list_url(self, news_list):
        articles = news_list['articles']
        for i in range(len(articles)):
            details_url = articles[i]['url']
            if 'www.bbc.co.uk' in details_url:
                result = self.filter.filter_data(details_url=details_url)
                if result:
                    print('Data already exists!')
                else:
                    details_res = requests.get(details_url,
                                               headers=self.headers,
                                               cookies=self.cookies).text
                    time.sleep(random.uniform(1, 5))
                    html_obj = etree.HTML(details_res)
                    source = int(6)
                    sourceUrl = details_url
                    jobId = time.time()
                    authorName = articles[i]['source']['name']
                    releaseTime = articles[i]['publishedAt']
                    title_source = articles[i]['title']
                    title = self.parsing_news_title(html=html_obj,
                                                    title_source=title_source)
                    thumbnail_img = articles[i]['urlToImage']
                    img = self.download_img(html=html_obj,
                                            thumbnail_img=thumbnail_img)
                    content = self.parsing_news_content(
                        content_html=details_res)
                    if content == 'Sign in to the BBC, or Register' or content is None or img is None or img == '':
                        pass
                    else:
                        data = {
                            'source': source,
                            'jobId': int(jobId),
                            'sourceUrl': sourceUrl,
                            'title': title,
                            'authorName': authorName,
                            'releaseTime': releaseTime,
                            'content': content,
                            'img': img
                        }
                        print('data:\n', data)
                        self.save.save_data(data=data, news='bbc')

    def parsing_news_title(self, html, title_source):
        title = ''.join(html.xpath('//h1[@class="story-body__h1"]/text()'))
        if title == '' or title is None:
            return title_source
        else:
            return title

    def parsing_news_content(self, content_html):
        text = fulltext(content_html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content

    def download_img(self, html, thumbnail_img):
        try:
            pic_list_1 = html.xpath(
                '//span[@class="image-and-copyright-container"]/img/@src')
            pic_list_2 = html.xpath(
                '//div[@class="js-delayed-image-load"]/@data-src')
            pic_list_3 = [i for i in pic_list_2 if '320' in i]
            pic_list = pic_list_1 + pic_list_3
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            pic_url_list = [i for i in pic_list if 'png' not in i]
            if pic_url_list == []:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img = r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                   str(index))
                return img
            else:
                for pic_url in pic_url_list:
                    if '320' in pic_url:
                        url = pic_url.replace("320", '660')
                        urllib.request.urlretrieve(
                            url,
                            r'%s.jpg' % (self.downloadPath + self.picPath +
                                         str(img_id) + "-" + str(index)))
                        img_list.append(
                            r'%s.jpg' %
                            (self.picPath + str(img_id) + "-" + str(index)))
                        index += 1
                    else:
                        urllib.request.urlretrieve(
                            pic_url,
                            r'%s.jpg' % (self.downloadPath + self.picPath +
                                         str(img_id) + "-" + str(index)))
                        img_list.append(
                            r'%s.jpg' %
                            (self.picPath + str(img_id) + "-" + str(index)))
                        index += 1
                img = ','.join(img_list)
                return img
        except:
            return None
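The NewsAPI-based crawlers in this file build their one-day query window with string arithmetic on today's date (to=today[:-1] + str(int(today[-1]) - 1)), which produces an invalid date whenever the day of month ends in 0 or 1 (for example '2019-05-10' turns into '2019-05-1-1'). A minimal sketch of a sturdier window follows, assuming the intent is a from-yesterday-to-today range; the helper name is illustrative and not part of the original code.

# Sketch only: builds the (from, to) date strings NewsAPI expects, assuming
# the crawlers intend a window covering the previous day and today.
from datetime import datetime, timedelta

def one_day_window():
    today = datetime.now().date()
    yesterday = today - timedelta(days=1)
    return yesterday.isoformat(), today.isoformat()

# Hypothetical usage inside parsing_bbc_news_list():
#     from_date, to_date = one_day_window()
#     self.news_api.get_everything(q=kw, from_param=from_date, to=to_date, ...)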
Code example #21
class ABC_News(object):
    def __init__(self):
        self.news_api = NewsApiClient(
            api_key='e7d5104fc5c74e259dbe2427b68257fb')
        self.key_word = [
            'U.S.', 'Lifestyle', 'Technology', 'Entertainment', 'Sports',
            'Health'
        ]
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            'cookieMonster=1; _cb_ls=1; SWID=522fc1e1-4ffd-4802-86fa-d7475f8dca57; optimizelyEndUserId=oeu1557122779744r0.05588715173473946; s_vi=[CS]v1|2E67E7728507B2CE-4000011580009D3A[CE]; __gads=ID=b52dc242d5ad893e:T=1557122797:S=ALNI_MaWnnuLLKP88qrPEsPEVuViaoGlJg; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; _cb=Dz5K21B0CX5JDkjMG; _v__chartbeat3=DbUUrKDDGTaEDqauaQ; _cb_svref=null; AkamaiAnalytics_BrowserSessionId=4d41ba72-5ab2-8f33-46aa-f5748aca9647; HTML_VisitIntervalStartTime=1557125661015; s_sess=%20s_cc%3Dtrue%3B%20s_sq%3D%3B; adnum=3undefined; _chartbeat2=.1557122809880.1557125676423.1.DoKBmGC2OW5QDWek5PEERi8oZtYZ.12; HTML_BitRateBucketCsv=0,19083,16715,0,0,0,0,0; HTML_VisitValueCookie=1|1|1|0|35798|35826|0|0|0|0|0|0|NaN; s_pers=%20s_fid%3D22C0AF24132A0778-001FDBB2DA7591AD%7C1620284117936%3B%20s_c20%3D1557125717941%7C1651733717941%3B%20s_c20_s%3DFirst%2520Visit%7C1557127517941%3B; HTML_isPlayingCount=2; GED_PLAYLIST_ACTIVITY=W3sidSI6IlhYU0wiLCJ0c2wiOjE1NTcxMjU3MzgsIm52IjowLCJ1cHQiOjE1NTcxMjU1NDQsImx0IjoxNTU3MTI1NjM3fSx7InUiOiIzbnlXIiwidHNsIjoxNTU3MTI1NzM3LCJudiI6MCwidXB0IjoxNTU3MTI1NTU2LCJsdCI6MTU1NzEyNTYyM30seyJ1IjoiWG81TSIsInRzbCI6MTU1NzEyNTczNywibnYiOjEsInVwdCI6MTU1NzEyNTU4NSwibHQiOjE1NTcxMjU3MzV9XQ..; HTML_VisitCountCookie=1'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/abc_news/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        self.parsing_abc_news_list()

    def parsing_abc_news_list(self):
        today = self.point_time
        for kw in self.key_word:
            print('keyword:\t', kw)
            news_list = self.news_api.get_everything(
                q=kw,
                sources='abc-news',
                domains='abcnews.go.com',
                from_param=today,
                to=today[:-1] + str(int(today[-1]) - 1),
                language='en',
                sort_by='relevancy',
                page_size=100,
            )
            self.parsing_news_list_url(news_list=news_list)

    def parsing_news_list_url(self, news_list):
        articles = news_list['articles']
        for i in range(len(articles)):
            details_url = articles[i]['url']
            result = self.filter.filter_data(details_url=details_url)
            if result:
                print('Data already exists!')
            else:
                time.sleep(random.uniform(1, 3))
                details_res = requests.get(details_url,
                                           headers=self.headers,
                                           cookies=self.cookies).text
                html_obj = etree.HTML(details_res)
                source = int(2)
                sourceUrl = details_url
                jobId = time.time()
                authorName = articles[i]['source']['name']
                releaseTime = articles[i]['publishedAt']
                title_source = articles[i]['title']
                title = self.parsing_news_title(html_obj=html_obj,
                                                title_source=title_source)
                content = self.parsing_news_content(content_html=details_res,
                                                    html_obj=html_obj,
                                                    newspaper=True)
                thumbnail_img = articles[i]['urlToImage']
                img = self.download_img(html_obj=html_obj,
                                        thumbnail_img=thumbnail_img)
                if img is None or img == '' or content is None or content == '':
                    pass
                else:
                    data = {
                        'source': source,
                        'jobId': int(jobId),
                        'sourceUrl': sourceUrl,
                        'title': title,
                        'authorName': authorName,
                        'releaseTime': releaseTime,
                        'content': content,
                        'img': img
                    }
                    print('data:\n', data)
                    self.save.save_data(data=data, news='abc')

    def parsing_news_title(self, html_obj, title_source):
        title = ''.join(
            html_obj.xpath('//header[@class="article-header"]//h1/text()'))
        if title == '' or title is None:
            return title_source
        else:
            return title

    def parsing_news_content(self,
                             content_html=None,
                             html_obj=None,
                             newspaper=False):
        if newspaper:
            text = fulltext(content_html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath(
                '//div[@id="news-content"]//p/text()')
            content = '<p>'.join([
                i.replace("\n", '').strip() for i in content_list
            ]).replace("<p><p>", '<p>')
        return content

    def download_img(self, html_obj, thumbnail_img):
        try:
            pic_url_list = html_obj.xpath('//figure//div//picture//img/@src')
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            if pic_url_list == []:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img = r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                   str(index))
                return img
            else:
                for pic_url in pic_url_list[:17]:
                    urllib.request.urlretrieve(
                        pic_url,
                        r'%s.jpg' % (self.downloadPath + self.picPath +
                                     str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' %
                        (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
        except:
            pass
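Every class in this file delegates de-duplication and persistence to Filter_Data and Save_Data, whose implementations are not shown in this excerpt. The stubs below only sketch the contract the crawlers appear to rely on (a truthy filter_data result for already-seen URLs, save_data taking the article dict plus a site tag); the in-memory set and the print call are placeholders, not the real storage layer.

# Placeholder sketch of the collaborators used by every crawler here.
class Filter_Data:
    def __init__(self):
        self._seen = set()  # stand-in for whatever store the real class queries

    def filter_data(self, details_url):
        # Presumed contract: truthy when the URL was already crawled.
        return details_url in self._seen

class Save_Data:
    def save_data(self, data, news):
        # Presumed contract: persist one article dict; `news` tags the site.
        print('[%s] saved %s' % (news, data['sourceUrl']))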
Code example #22
class Smart_News():
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/smartNews/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        index = 1
        while index < 3:
            url_dic = {
                'news':
                'https://www.smithsonianmag.com/category/smart-news/?no-ist%252F=938&page={}'
                .format(index),
                'history':
                'https://www.smithsonianmag.com/category/history/?page={}'.
                format(index),
                'science':
                'https://www.smithsonianmag.com/category/science-nature/?page={}'
                .format(index),
                'innovation':
                'https://www.smithsonianmag.com/category/innovation/?page={}'.
                format(index),
                'arts_culture':
                'https://www.smithsonianmag.com/category/arts-culture/?page={}'
                .format(index),
                'travel':
                'https://www.smithsonianmag.com/category/travel/?page={}'.
                format(index),
                'smithsonian':
                'https://www.smithsonianmag.com/category/smithsonian-institution/?page={}'
                .format(index),
            }
            for kw in url_dic:
                print('keyword:\t', kw)
                self.parsing_smart_news_list_page(url_dic[kw])
            index += 1

    def parsing_smart_news_list_page(self, list_url):
        time.sleep(random.uniform(5, 10))
        list_res = requests.get(url=list_url, headers=self.headers).text
        html = etree.HTML(list_res)
        thumbnail_img_list = html.xpath('//main[@class="main"]//img//@src')
        details_url_list = html.xpath('//h3[@class="headline"]//a/@href')
        for i in range(len(details_url_list)):
            result = self.filter.filter_data(
                details_url='https://www.smithsonianmag.com' +
                details_url_list[i])
            if result:
                print('Data already exists!')
            else:
                data = self.parsing_details_page(
                    details_url='https://www.smithsonianmag.com' +
                    details_url_list[i],
                    thumbnail_img=thumbnail_img_list[i])
                if data is None or data == '':
                    pass
                else:
                    print('data:\n', data)
                    self.save.save_data(data=data, news='smart')

    def parsing_details_page(self, details_url, thumbnail_img):
        time.sleep(random.uniform(3, 5))
        details_res = requests.get(url=details_url, headers=self.headers).text
        details_html = etree.HTML(details_res)
        source = int(5)
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(details_html.xpath('//h1[@class="headline"]/text()'))
        if title is None or title == '':
            pass
        else:
            text = fulltext(details_res).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
            author = details_html.xpath('//a[@class="author-name"]/text()')
            authorName = ''.join(
                [i.replace("/n", '<p>').strip() for i in author])
            releaseTimeList = details_html.xpath(
                '//time[@class="pub-date"]/text()')
            releaseTime = ''.join(
                [i.replace("/n", '<p>').strip() for i in releaseTimeList])
            img = self.analysis_filter_img_url(html=details_html,
                                               thumbnail_img=thumbnail_img)
            if img is None or img == '' or content is None or content == '':
                pass
            else:
                return {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }

    def analysis_filter_img_url(self, html, thumbnail_img):
        href_list = html.xpath('//main[@class="main"]//img//@src')
        pic_url_list = [
            i for i in href_list if 'filer' in i and 'png' not in i
        ]
        img = self.download_img(pic_url_list=pic_url_list)
        if img == '' or img is None:
            img = self.download_img(pic_url_list=[thumbnail_img])
            return img
        else:
            return img

    def download_img(self, pic_url_list):
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            for pic_url in pic_url_list[:17]:
                if '220x130' in pic_url or '60x60' in pic_url:
                    pass
                else:
                    response = requests.get(pic_url)
                    image = Image.open(BytesIO(response.content))
                    image.save(r'%s.jpg' % (self.downloadPath + self.picPath +
                                            str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' %
                        (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
            img = ','.join(img_list)
            return img
        except:
            return None
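Smart_News (and later Medium_News and UPROXX_News) downloads images with requests plus Pillow and saves them as .jpg. Pillow raises an OSError when asked to save an RGBA or paletted image as JPEG, and the requests.get calls above carry no timeout, so one stalled CDN request can hang the crawl. A defensive variant is sketched below; the function name is illustrative, not from the source.

# Sketch: download one image and store it as JPEG, converting to RGB first
# so RGBA/paletted sources do not make Pillow's JPEG encoder fail.
import requests
from io import BytesIO
from PIL import Image

def fetch_jpeg(pic_url, dest_path, timeout=10):
    response = requests.get(pic_url, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of saving garbage
    image = Image.open(BytesIO(response.content)).convert('RGB')
    image.save(dest_path, format='JPEG')
    return dest_path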
Code example #23
class FOX_News(object):
    def __init__(self):
        self.news_api = NewsApiClient(api_key='f04f7a8db32841299d4a7fae723e61b2')
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.keyword = ['us', 'world', 'opinion', 'politics', 'entertainment', 'lifestyle', 'health', 'travel', 'autos']
        self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        self.cookies = {'cookie': '_cb_ls=1; optimizelyEndUserId=oeu1556269407120r0.4256555044820445; cto_lwid=a3569f8e-fd62-48fd-8cf3-52e3a3d49218; _gcl_au=1.1.1392012605.1556269408; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22cfa5a6d1-cac6-4a48-97ed-e2a25488a94a%22; _ga=GA1.2.353904812.1556269412; _cb=D6-ViRhsUuoBSGama; __gads=ID=0a226a472ca026e8:T=1556269422:S=ALNI_Mb8qEqiRmqgHFem87cBOSEiCTTaJQ; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _scid=47caae3a-e216-48d9-8cdc-3159238a7671; FXN_flk=1; AMCVS_17FC406C5357BA6E0A490D4D%40AdobeOrg=1; _gid=GA1.2.1114110874.1557801782; s_cc=true; _csrf=qWmVWRGxKfzqXCxI9_yuGfZI; s_sq=%5B%5BB%5D%5D; AKA_A2=A; ak_bmsc=3362DC65CD8C5F6FE2F5F2E24D7DD7FE6876060DED3200004C65DA5C0E141B34~pl9V8ncmx0JI/913nUJgfYoKX6Gte64URfMw4gBpTiaQPEzpKVnyOxRIc/NBeHS9HwdJZ+Fd5cB6oDFLpRNLt93qTu4fSjWuP7e+PZea5EArlAr63c0rHI5P+U7hKycyZfvpMt2MSsmqLqtUqZqavEQxBprGj74WIJ0a5ZnH2vSP1CYH+4ijzZPqw/REPx+WlZ+jHCptyFj7C9pjBHstMpWmr4RW6NTHMwyBsckJbiQr0p+5gPNq/FUjz06HN7q/b4; _cb_svref=null; AMCV_17FC406C5357BA6E0A490D4D%40AdobeOrg=2121618341%7CMCIDTS%7C18031%7CMCMID%7C37985443320715041480395091296536963184%7CMCAAMLH-1557842971%7C7%7CMCAAMB-1558421455%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1557823855s%7CNONE%7CMCAID%7CNONE; s_pers=%20s_ppn%3Dfnc%253Aroot%253Aroot%253Achannel%7C1557806491239%3B%20omtr_lv%3D1557816723185%7C1652424723185%3B%20omtr_lv_s%3DLess%2520than%25201%2520day%7C1557818523185%3B%20s_nr%3D1557816723191-Repeat%7C1560408723191%3B; _chartbeat2=.1556269420027.1557816723254.0000000010000001.CY0VlWCO9QUgDFFbO8QLxoyCPj7ho.2; s_sess=%20omtr_evar17%3DD%253Dc17%3B%20s_ppvl%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C22%252C83%252C5886%252C1920%252C925%252C1920%252C1080%252C1%252CL%3B%20SC_LINKS%3D%3B%20s_ppv%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C63%252C96%252C3550%252C1920%252C969%252C1920%252C1080%252C1%252CL%3B; criteo_write_test=ChUIBBINbXlHb29nbGVSdGJJZBgBIAE; bm_sv=8A6F070ED17B9F85AD022D562A830573~oN82OtrVhgL99OXQYjpsFWPKOuwBoUVwy60qge23Kx9pNN2MIe3/AhQZJZ+na42MjDAIyCRuvDS6csM6csNzVnCY/0Ue7dXJIHzFvEjq/KcL+5X57fiZK5b9W/W3g/hw1kSCvVxA/GNO4h9IlDmY6OElMgVSqN2h9kq42m6z+n0='}
        self.downloadPath = '/data/crawler'
        self.picPath = '/fox_news/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()



    def run(self):
        self.parsing_fox_news_list()


    def parsing_fox_news_list(self):
        today = self.point_time
        for kw in self.keyword:
            print('keyword:\t', kw)
            news_list = self.news_api.get_everything(q=kw,
                                                    sources='fox-news',
                                                    domains='foxnews.com',
                                                    from_param=today,
                                                    to=today[:-1] + str(int(today[-1]) - 1),
                                                    language='en',
                                                    sort_by='relevancy',
                                                    page_size=100, )
            self.parsing_fox_news_list_url(news_list=news_list)


    def parsing_fox_news_list_url(self, news_list):
        articles = news_list['articles']
        for i in range(len(articles)):
            details_url = articles[i]['url']
            result = self.filter.filter_data(details_url=details_url)
            if result:
                print('Data already exists!')
            else:
                details_res = requests.get(details_url, headers=self.headers, cookies=self.cookies).text
                time.sleep(random.uniform(1, 3))
                html_obj = etree.HTML(details_res)
                source = int(9)
                sourceUrl = details_url
                jobId = time.time()
                author = articles[i]['source']['name']
                authorName = self.parsing_author_name(html_obj=html_obj, name_source=author)
                releaseTime = articles[i]['publishedAt']
                title = articles[i]['title']
                content = self.parsing_news_content(content_html=details_res, html_obj=html_obj, newspaper=True)
                thumbnail_img = articles[i]['urlToImage']
                img = self.download_img(html_obj=html_obj, thumbnail_img=thumbnail_img)
                if img is None or img == '' or content is None or content == '':
                    pass
                else:
                    data = {'source': source, 'jobId': int(jobId), 'sourceUrl': sourceUrl, 'title': title, 'authorName': authorName,
                            'releaseTime': releaseTime, 'content': content, 'img': img}
                    print('data:\n', data)
                    self.save.save_data(data=data, news='fox')


    def parsing_author_name(self, html_obj, name_source):
        authorName = ''.join(html_obj.xpath('//div[@class="author-byline"]//span/span//text()'))
        if authorName == '' or authorName is None:
            return name_source
        else:
            return authorName


    def parsing_news_content(self, content_html=None, html_obj=None, newspaper=False):
        try:
            if newspaper:
                text = fulltext(content_html).split('\n')
                txt = list(filter(lambda x: x.strip() != '', text))
                txt_list = []
                for i in txt:
                    if i.isupper():
                        pass
                    else:
                        txt_list.append(i)
                content = '<p>'.join(txt_list)
            else:
                content_list = html_obj.xpath('//div[@class="article-body"]//p//text()')
                content = '<p>'.join([i.replace("\n", '').strip() for i in content_list]).replace("<p><p>", '<p>')
            return content
        except:
            pass


    def download_img(self, html_obj, thumbnail_img):
        pic_url_list = html_obj.xpath('//div[@class="article-body"]//img/@src')
        img_id = str(uuid.uuid4()).replace('-','')
        index = 1
        img_list = []
        try:
            if pic_url_list == []:
                urllib.request.urlretrieve(thumbnail_img, r'%s.jpg' % (self.downloadPath + self.picPath + str(img_id) + "-" + str(index)))
                img = r'%s.jpg' % (self.picPath + str(img_id) + "-" + str(index))
                return img
            else:
                for pic_url in pic_url_list[:17]:
                    urllib.request.urlretrieve(pic_url, r'%s.jpg' % (self.downloadPath + self.picPath + str(img_id) + "-" + str(index)))
                    img_list.append(r'%s.jpg' % (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
        except:
            return None
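FOX_News.parsing_news_content drops every line that is entirely upper case before joining the paragraphs, which in practice removes inline promos such as "CLICK HERE TO GET THE FOX NEWS APP". A small self-contained illustration of that filter (the sample lines are fabricated for demonstration):

# Demonstration of the all-caps filter used in parsing_news_content above.
lines = [
    'The senator spoke on Tuesday.',
    'CLICK HERE TO GET THE FOX NEWS APP',
    'She is expected to vote later this week.',
]
kept = [line for line in lines if not line.isupper()]
print('<p>'.join(kept))
# -> The senator spoke on Tuesday.<p>She is expected to vote later this week.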
Code example #24
File: medium_news.py  Project: zhangpeng0v0/news
class Medium_News():
    def __init__(self):
        self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
        self.cookies = {'cookie': '__cfduid=d6ba6448200002747444269a19593dbdd1555908016; __cfruid=985eba5fa2a449247bfd0598c1c1c5ec968a9416-1558490711; _ga=GA1.2.1064338879.1558490714; _gid=GA1.2.136804405.1558490714; lightstep_guid/medium-web=8f0cd65b0ef4abdb; lightstep_session_id=fcac5cf910466bc4; pr=1; tz=-480; uid=3314454e53ae; sid=1:4N4F93p0H1gPvFCIGldZUdIdQeFiifNF6stzqPFyBikCsGpjcmnIyu/NNWwIVVTx; xsrf=89TsRPcZaZKu; lightstep_guid/lite-web=7d9b16045b97b840; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://medium.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1558512703778%2C%22slts%22:1558503751909}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:3%2C%22last_session_ts%22:1558512703778}; sz=1905'}
        self.downloadPath = '/data/crawler'
        self.picPath = '/huffpost/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()


    def run(self):
        news_dict = {
            'topic':'https://medium.com/topic/editors-picks',
            'technology':'https://medium.com/topic/technology',
            'startups':'https://medium.com/topic/startups',
            'self':'https://medium.com/topic/self',
            'politics':'https://medium.com/topic/politics',
            'health':'https://medium.com/topic/health',
            'design':'https://medium.com/topic/design',
            'art':'https://medium.com/topic/art',
            'beauty':'https://medium.com/topic/beauty',
            'humor':'https://medium.com/topic/humor',
            'fiction':'https://medium.com/topic/fiction',
            'media':'https://medium.com/topic/social-media',
            'crime':'https://medium.com/topic/true-crime',
            # 'comics':'https://medium.com/topic/comics',
        }
        for i in news_dict:
            self.parsing_medium_topic_list_page(url=news_dict[i])

        news_list = {
            'elemental':'https://medium.com/elemental-by-medium',
            'heated':'https://heated.medium.com/',
            'human':'https://medium.com/human-parts',
        }
        for j in news_list:
            self.parsing_medium_other_list_page(url=news_list[j])


    def parsing_medium_topic_list_page(self, url):
        html = requests.get(url=url, headers=self.headers, cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html_script = r'<script>(.*?)</script>'
        script = re.findall(html_script, html, re.S | re.M)
        mediumUrl_rule = r'"mediumUrl":"(.*?)"'
        rule = re.compile(mediumUrl_rule)
        result = rule.findall(script[4])
        for i in result:
            details_url = i.replace(r'\u002F', '/')
            self.parsing_details_page(details_url=details_url)



    def parsing_medium_other_list_page(self, url):
        res = requests.get(url=url, headers=self.headers, cookies=self.cookies).text
        html = etree.HTML(res)
        list_page_urls = html.xpath('//div[@class="u-lineHeightBase postItem"]/a/@href')
        for details_url in list_page_urls:
            self.parsing_details_page(details_url=details_url)


    def parsing_details_page(self, details_url):
        status = self.filter.filter_data(details_url=details_url)
        if status:
            print('Data already exists!')
        else:
            res = requests.get(url=details_url, headers=self.headers, cookies=self.cookies).text
            time.sleep(random.uniform(1, 3))
            html = etree.HTML(res)
            source = int(8)
            jobId = time.time()
            sourceUrl = details_url
            title = ''.join(html.xpath('//div[@class="section-content"]//h1//text()'))
            if title == '' or title is None:
                pass
            else:
                authorName = ''.join(html.xpath('//div[@class="u-paddingBottom3"]/a/text()'))
                releaseTime = ''.join(html.xpath('//time/text()'))
                content = self.analysis_news_content(html=res, obj=html, newspaper=False)
                img = self.analysis_news_img(obj=html)
                if img is None or img == '' or content is None or content == '':
                    pass
                else:
                    data = {'source': source,'jobId': int(jobId), 'sourceUrl': sourceUrl, 'title': title, 'authorName': authorName,
                            'releaseTime': releaseTime, 'content': content, 'img': img}
                    print('data:\n', data)
                    self.save.save_data(data=data, news='medium')


    def analysis_news_content(self, html, obj, newspaper=True):
        if newspaper:
            text = fulltext(html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
            return content
        else:
            content_list= obj.xpath('//div[@class="section-content"]//text()')[7:]
            content = '<p>'.join([i.replace("\n", '').strip() for i in content_list]).replace("<p><p>", '<p>')
            return content


    def analysis_news_img(self, obj):
        try:
            pic_url_list = obj.xpath('//img[@class="progressiveMedia-image js-progressiveMedia-image"]/@data-src')
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            if pic_url_list == []:
                return None
            else:
                for pic_url in pic_url_list[:17]:
                    response = requests.get(pic_url)
                    image = Image.open(BytesIO(response.content))
                    image.save(r'%s.jpg' % (self.downloadPath + self.picPath + str(img_id) + "-" + str(index)))
                    img_list.append(r'%s.jpg' % (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
        except:
            return None
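Medium_News.parsing_medium_topic_list_page pulls article URLs out of the fifth inline <script> block with a regular expression and then undoes Medium's \u002F escaping by hand. The snippet below reproduces just that extraction step on a fabricated miniature payload, so the regex and the replace call can be checked in isolation:

import re

# Fabricated miniature of the escaped JSON Medium embeds in its topic pages.
script = '{"mediumUrl":"https:\\u002F\\u002Fmedium.com\\u002Ftopic\\u002Ftechnology"}'

urls = [m.replace('\\u002F', '/')
        for m in re.findall(r'"mediumUrl":"(.*?)"', script)]
print(urls)  # ['https://medium.com/topic/technology']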
Code example #25
class Techcrunch_News(object):
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            'rxx=293dmvskaws.1i9frk99&v=1; _ga=GA1.2.893461412.1556504122; _fbp=fb.1.1556504132562.1028440015; __pnahc=0; __pat=-25200000; OTH=v=1&d=eyJraWQiOiIwMTY0MGY5MDNhMjRlMWMxZjA5N2ViZGEyZDA5YjE5NmM5ZGUzZWQ5IiwiYWxnIjoiUlMyNTYifQ.eyJjdSI6eyJndWlkIjoiUDJCVVRPR0RVT0VGVERUV0pNTFdKNFlFSDQiLCJzaWQiOiJEYTNsbjdKbG11MmwifX0.HoiBv5OlQvNY2x6q-LJBN-VzgCErT7GTCnODqLLQ8foasqTVUCPVXvwHFniFc7CwCf0n7lmSgfrSycQNevIFSJHZ7M-S9SRQH4FMtu91qykbuvAzAOQZRw_iz_warZWFJtpIys0EVH4Gn9wYqaqLXv-5lO39fuPsqJx9z7X6luQ; BX=e7bndb9eccnhl&b=4&d=lrdDlyNpYELw7nQr45ylAA--&s=8v&i=_g5gVQJJ34nc.9WZ3JGN; GUC=AQEAAQJc0SBdu0IgxgTZ&s=AQAAAJQ2Kk5V&g=XM_RDg; __tbc=%7Bjzx%7DjGAToaZMxJYLoS7N4KRjDaxHalABoj31MSFHkZP0UNxHLBrPMu6clUAaZwsaHnUnQQaMDnEIRO1fDpAMrkMVflCNhUFWsFFB8n1hsUBhKEKL38bZEAUprS1G6wPj4GNM4bchi9l7YPvr6or9wrNMLmWzw2hPXY5j7UVUWDOUH_U; __pcvc={}; _parsely_session={%22sid%22:6%2C%22surl%22:%22https://techcrunch.com/2019/05/04/uber-is-facing-australian-class-action-suit-alleging-unlawful-conduct/%22%2C%22sref%22:%22%22%2C%22sts%22:1558071232131%2C%22slts%22:1557885616789}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:6%2C%22last_session_ts%22:1558071232131}; cmp=t=1558071232&j=0; _gid=GA1.2.1358424281.1558071235; _gat=1; __adblocker=false; xbc=%7Bjzx%7DYW6Rlvft6bPCfQyJ3DedvFReFNeSWzD34uqjUgyftdmRMMeJaQrGxlc0RnHslaNJuW923ovrMyh3fAAIY_x7R_Da15zP9YopEn3Om90NI0T5GRkVz40I1R8zV8ZQB68kBF2YuF_JsLshS1YKLFcyLSN12KbxNP4vrnBqkqtIO2yaJ5LoTRrcAPA64ePs4VtlokVTqGlotnhRSiMBSeplyP6M0a5Lj5rCIn1GIetfFxi-gIZuaMlkdAHSSmrqD1nfLBrQXcHSWrDRR0PGzzVvFjSVEXhIbldyChWDeAkkgN0hgI8KXA304yID8T-gx9UZiwWN897EFpRv3ZNtbg5IqW5GixrDYN1X7y_FdQGe5c4Tlz-figdB5Mbe5Qj2godX23QAk9Y6PbNudCC8Em1tgOzteL0CnIShQ--XvwA9qsvEZSWlAxrGfFmStXYiVaTRc1BM1DSemqPeEIoI_XtXT1h-FOYTaDZfqgflEl3Qb8MlWCowztRcnRznul-OxLIUMkPAraljlm83Bs9Z0ZZTeULOzew-rPTbrZfnXeQjr8OJtrUbNexMaJib654rgmNL7kXPxNmVdB1ZWX5IXEgmiW4XKjZACr0RxZbzXhXFfEN9gPbI7xVJJD8kfmfWoGW_0O6MebIrRbW8xxFPLY90Mw; __pvi=%7B%22id%22%3A%22v-2019-05-17-13-33-55-875-qtmTHL61BOYny2co-1510a80d282f15b71b1e5f4d8bc358ee%22%2C%22domain%22%3A%22.techcrunch.com%22%2C%22time%22%3A1558072171993%7D'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/techcrunch/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        pg = 1
        while pg < 3:
            start_url = 'https://techcrunch.com/wp-json/tc/v1/magazine?page={}&_embed=true'.format(
                pg)
            self.parsing_tc_news_list_url(list_url=start_url)
            pg += 1

    def parsing_tc_news_list_url(self, list_url):
        res = requests.get(url=list_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        article = json.loads(res)
        for i in range(len(article)):
            sourceUrl = article[i]["link"]
            status = self.filter.filter_data(details_url=sourceUrl)
            if status:
                pass
            else:
                releaseTime = article[i]["date_gmt"]
                header_info = {
                    'sourceUrl': sourceUrl,
                    'releaseTime': releaseTime
                }
                self.parsing_details_page(header_info=header_info)

    def parsing_details_page(self, header_info):
        jobId = time.time()
        source = int(14)
        sourceUrl = header_info['sourceUrl']
        releaseTime = header_info['releaseTime']
        res = requests.get(url=sourceUrl,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(res)
        title = ''.join(html.xpath('//h1[@class="article__title"]/text()'))
        authorName = ''.join(
            html.xpath('//div[@class="article__byline"]/a/text()')).strip()
        content = self.parsing_news_content(content_html=res)
        img = self.download_img(html_obj=html)
        if img is None or img == '' or content is None or content == '':
            pass
        else:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='techcrunch')

    def parsing_news_content(self, content_html=None):
        text = fulltext(content_html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content

    def download_img(
        self,
        html_obj,
    ):
        pic_url_list = html_obj.xpath(
            '//article[@class="article-container article--post "]//img/@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_url_list == []:
            return None
        else:
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url, r'%s.jpg' % (self.downloadPath + self.picPath +
                                          str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' %
                    (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img
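Nearly every class in this file repeats the same three lines to turn a page's HTML into '<p>'-joined body text via newspaper's fulltext. Factored into one helper it would look like the sketch below; the function name is illustrative, and the original code keeps the logic inline in each class:

from newspaper import fulltext

def html_to_paragraphs(page_html):
    # fulltext() extracts the article body; blank lines are dropped and the
    # remaining paragraphs joined with '<p>', matching the inline versions above.
    lines = fulltext(page_html).split('\n')
    paragraphs = [line for line in lines if line.strip() != '']
    return '<p>'.join(paragraphs)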
Code example #26
File: matador_network.py  Project: zhangpeng0v0/news
class Matador_Network(object):
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            '_ga=GA1.2.1006188425.1558506407; __auc=c07ed86116ade38958b6f215c90; __gads=ID=ebab27bbe751d3d9:T=1558506409:S=ALNI_MbKANVLVZlZmub7wcHXVdRq__9uAQ; _fbp=fb.1.1558506413908.2064431331; cache-primed=1; mn-push-status=8; EU=(null); _gid=GA1.2.549309516.1558921947; __asc=2b5b28f116af808ea5c6cf504f0'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/matador_network/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        pg = 8
        while pg < 68:
            start_url = 'https://matadornetwork.com/wp-content/plugins/matadornetwork/mn-ajax.php?component=post&action=get_posts&' \
                        'offset={}' \
                        '&posts_per_page=20&grid=small&post__not_in%5B%5D=546093&post__not_in%5B%5D=%20546069&post__not_in%5B%5D=%20545872&post__not_in%5B%5D=%20497520&post__not_in%5B%5D=%20501737&post__not_in%5B%5D=%20486578&post__not_in%5B%5D=%20342847&home=1&_=1558941893778'.format(pg)
            self.parsing_matador_network_list_page(list_url=start_url)
            pg += 20

    def parsing_matador_network_list_page(self, list_url):
        res = requests.get(url=list_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        js = json.loads(res)
        html = js['html']
        html_obj = etree.HTML(html)
        urls_list = html_obj.xpath(
            '//a[@class="article__image-wrapper"]/@href')
        for details_url in urls_list:
            status = self.filter.filter_data(details_url=details_url)
            if status:
                print('Data already exists!')
            else:
                data = self.parsing_details_page_url(details_url=details_url)
                print('data:\t', data)
                self.save.save_data(data=data, news='matador')

    def parsing_details_page_url(self, details_url):
        res = requests.get(url=details_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        time.sleep(random.uniform(1, 3))
        source = int(11)
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(html.xpath('//div[@class="container"]//h1/text()'))
        releaseTime = ''.join(
            html.xpath('//div[@class="post-info-date"]/text()'))
        authorName = ''.join(
            html.xpath('//a[@class="post-info-author"]/text()'))
        content = self.analysis_news_content(content_html=res,
                                             html_obj=html,
                                             newspaper=True)
        img = self.analysis_content_img(html_obj=html)
        if img is None or img == '' or content is None or content == '':
            pass
        else:
            return {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }

    def analysis_news_content(self,
                              content_html=None,
                              html_obj=None,
                              newspaper=True):
        if newspaper:
            text = fulltext(content_html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath(
                '//div[@class="post-content"]//text()')
            content = '<p>'.join([
                i.replace("\n", '').strip() for i in content_list
            ]).replace("<p><p>", '<p>')
        return content

    def analysis_content_img(self, html_obj):
        pic_url_list = html_obj.xpath('//div[@class="container"]//img/@src')
        pic_set = [i for i in pic_url_list if '.png' not in i]
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_set == []:
            return None
        else:
            for pic_url in pic_set:
                response = requests.get(pic_url)
                image = Image.open(BytesIO(response.content))
                image.save(r'%s.jpg' % (self.downloadPath + self.picPath +
                                        str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' %
                    (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img
Code example #27
File: buzzfeed_news.py  Project: zhangpeng0v0/news
class Buzz_Feed_News():
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            '_ga=GA1.2.2006098489.1555559856; _fbp=fb.1.1555559860721.1190642659; __qca=P0-464700868-1555559857580; permutive-id=35526ebd-337f-4b00-bf5b-10a6610a85a5; __gads=ID=2cb4be529258fba6:T=1555559912:S=ALNI_MawMBKcEjsbSC3roAOVcCQm5lCB2A; _pdfps=%5B7684%2C13160%2C13164%2C13319%2C13730%2C14474%2C10166%2C12448%2C12449%2C12882%2C13097%2C13214%2C13217%2C13276%2C13278%2C13834%2C14353%2C10748%2C10788%2C13102%2C13144%2C13145%2C13146%2C13147%2C13150%2C13151%2C13157%2C13163%2C13169%2C13667%2C14437%2C14458%2C10224%2C10915%2C13153%2C13675%2C14142%2C13064%2C13216%2C13279%2C14431%2C14432%2C10749%2C10789%2C10906%2C10916%2C10917%2C11655%2C12233%2C12244%2C12679%2C12985%2C13099%2C13101%2C13148%2C13244%2C13741%2C13742%2C14143%2C14479%2C14872%2C15077%2C15128%2C15139%2C10222%2C13100%2C10216%2C%2212244-15-22969%22%2C%2212244-15-22970%22%2C%2212679-5-118997US%22%2C%2212985-5-118497US%22%2C%2213244-5-325997US%22%2C%2213245-5-325997US%22%2C%2213246-5-325997US%22%2C%2213458-15-22969%22%2C%2213458-15-22970%22%2C%2213459-15-22969%22%2C%2213459-15-22970%22%2C%2214229-5-318346US%22%2C%2214351-15-22835%22%2C%2214479-5-325547US%22%2C%2214872-15-22835%22%2C%2214872-15-22814%22%2C%2215063-5-318346US%22%2C%2215063-5-325346US%22%5D; permutive-session=%7B%22session_id%22%3A%2215c3b65a-580a-47aa-ab95-b44076421376%22%2C%22last_updated%22%3A%222019-04-27T02%3A27%3A38.962Z%22%7D; _cmpQcif3pcsupported=1; _gid=GA1.2.13310005.1557196067; _gat=1; sailthru_pageviews=4; sailthru_content=cbe347ea3dd8f028b2a79dd2124b2609d73dc57549ee138bd1d9dedee18e797c3cde4668fc0929097a33767e5b408948300e3683df34cf01dce50805bbb1306ce0bce460f7e70fed288b52d84bd9816499693f0167a253c9d1ba851de3a9d8e9dd7ae6730eff39df6f3b2fee47cae2908e3260668e0361ea9bd2ebb68e2a0591e9ec864cd274cc1d8b3a98016c2bcf1d874e57a78b55d2f981aeb6d2c79bfecc9d43236330abbff1afd96b7ffa626bb4936065bb0196c7181b628021dea483cf13a2f044347925f429d5fbc7008162c9cd736b79ca68d62341101204bca0cca1ff22ee54be7fa316d48db768db05dda4f044956926b209e90497a64953e290f7; sailthru_visitor=a057d87e-b51c-4f1e-9167-d146c2a3a7bc'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/buzzfeed/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        page = 1
        while page < 3:
            news_list_url = 'https://www.buzzfeednews.com/site-component/v1/en-us/trending-on-buzzfeednews?page={}&page_size=10'.format(
                page)
            self.parsing_buzzFeed_news_list(news_list_url=news_list_url)
            time.sleep(random.uniform(60, 70))
            page += 1

    def parsing_buzzFeed_news_list(self, news_list_url):
        res = requests.get(url=news_list_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        item = json.loads(res)
        results = item['results']
        for i in range(len(results)):
            jobId = time.time()
            source = int(3)
            sourceUrl = results[i]['url']
            filter_data = self.filter.filter_data(details_url=sourceUrl)
            if filter_data:
                print('Data already exists!')
            else:
                title = results[i]['name']
                releaseTime = results[i]['created_at']
                thumbnail_img = results[i]['image']
                article = self.parsing_details_url(details_url=sourceUrl,
                                                   thumbnail_img=thumbnail_img)
                if article == '' or article is None:
                    pass
                else:
                    data = {
                        'source': source,
                        'jobId': int(jobId),
                        'sourceUrl': sourceUrl,
                        'title': title,
                        'authorName': article['authorName'],
                        'releaseTime': releaseTime,
                        'content': article['content'],
                        'img': article['img']
                    }
                    print('data:\n', data)
                    self.save.save_data(data=data, news='buzzfeed')

    def parsing_details_url(self, details_url, thumbnail_img):
        time.sleep(random.uniform(2, 5))
        html = requests.get(url=details_url,
                            headers=self.headers,
                            cookies=self.cookies).text
        html_obj = etree.HTML(html)
        authorName = ''.join(
            html_obj.xpath(
                '//span[@class="news-byline-full__name xs-block link-initial--text-black"]/text()'
            ))
        content_list = html_obj.xpath(
            '//div[@data-module="article-wrapper"]//p//text()')
        content = '<p>'.join([
            i.replace("\n", '').strip() for i in content_list
        ]).replace("<p><p>", '<p>')
        if content == '' or content is None:
            pass
        else:
            img = self.download_img(html=html_obj, thumbnail_img=thumbnail_img)
            if img == '' or img is None:
                return None
            else:
                article = {
                    'authorName': authorName,
                    'content': content,
                    'img': img
                }
                return article

    def download_img(self, html, thumbnail_img):
        pic_list_1 = html.xpath('//figure//img/@data-src')
        pic_list_2 = html.xpath('//picture//img/@src')
        pic_list_3 = [i for i in pic_list_1 if i not in pic_list_2]
        pic_url_list = pic_list_2 + pic_list_3
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_url_list == []:
            urllib.request.urlretrieve(
                thumbnail_img, r'%s.jpg' % (self.downloadPath + self.picPath +
                                            str(img_id) + "-" + str(index)))
            img = r'%s.jpg' % (self.picPath + str(img_id) + "-" + str(index))
            return img
        else:
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url, r'%s.jpg' % (self.downloadPath + self.picPath +
                                          str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' %
                    (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img
Code example #28
File: nypost_news.py  Project: zhangpeng0v0/news
class New_York_Post_news(object):
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            'optimizelyEndUserId=oeu1555918735298r0.6812295616493853; _ga=GA1.2.501461025.1557229334; __pnahc=0; __tbc=%7Bjzx%7DbOREsfUR6SMcRp5niCu4XyJqGIm9xLbU2svbGCB3e5Y-ZlpcwIXRF_gOx5ssrGMlRCzVeeO-JA50xgthIobqDMJS2og0GbDQCa7bPklPxk1yFokaLXVHvRa0s4J7s817Uqt8s09tJ4GcmUzNoGeVhA; __pat=-14400000; __gads=ID=b83698edc796f48a:T=1557229341:S=ALNI_MZYuxiKvMlIMXV92xfTw1XB6ms7EA; __qca=P0-643878854-1557229359497; _gid=GA1.2.1904385410.1557829332; _ncg_g_id_=9d6dce3e-2d15-45b6-a948-9dbb7fa69171; OX_plg=pm; _ncg_id_=16a959d5d8b-5c91174d-2719-4974-99d8-33e86e4219c2; _pc_morningReportRan=true; _sp_ses.3725=*; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://nypost.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1557886391506%2C%22slts%22:1557829347024}; _parsely_visitor={%22id%22:%2236e3895b-5884-4e1e-b290-0b0a1e631850%22%2C%22session_count%22:3%2C%22last_session_ts%22:1557886391506}; AMP_TOKEN=%24NOT_FOUND; bounceClientVisit2045v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgHYCeEA9iggHQDGFAtkcQKYDu6BIANCAE4wQZStXpMQAXyA; _ncg_sp_ses.64db=*; _gat=1; __idcontext=eyJjb29raWVJRCI6IlpORUtKQzRVUEwzNUhUUTI2QkNJVUIySDZZVUFOTjI1V0E0VzI1WERQSU9RPT09PSIsImRldmljZUlEIjoiWk5FS0pDNFVPVFU1NzVaQTVZSU1LVlNHUlVFU0pOM0JWNEdTWVJHNUM0WEE9PT09IiwiaXYiOiJQR1ZMV1NQWTc2R0pGMkhISUJCVEpBTEc0UT09PT09PSIsInYiOjF9; __pvi=%7B%22id%22%3A%22v-2019-05-15-11-28-33-472-TCTTCzJ3MeatqGej-a5838d1dc51fc02369a5c570d5bb61d6%22%2C%22domain%22%3A%22.nypost.com%22%2C%22time%22%3A1557891488612%7D; __adblocker=false; xbc=%7Bjzx%7DCTyXA66nwH4u0LSnMj_hrMtwYTk54JF59dLs5o_wp3snMXNdvj2Yy6TBtbRxyGxf14_VW1q5TLlW6vo43sH4bt1xlU681XmGmmXaT-SetcMReVqnxTFjI2gW-7RAeJAQFo8mvk88JA2ghePCorbhbWMs02tfzF_-k1Krwk0Vz5I_4BWDD33FM1fohQjjcgYaPM-1rt-sKsCEnjEZlCFDpqiFO54mgbKUB-kFVcHhi-_WjEFJazS2Vtn_ZZJHi-y44g16CXbGiqpHfoDR9DPafHAts-4n-G65fMRtwt9Ml8JaS73yz78cdU_g515IoAaF5TiHkpwV8OOumbfwBrkq2AU3h3dtbnjKZd070tIlyyZdFCbfpjqxaxax2jiN0PitRuCioMt8p4TO3fxq6ok4tA; _ncg_sp_id.64db=d08b0f08-4e58-40a9-8cd3-63efa5ae79b6.1557229345.5.1557891492.1557886389.183b444b-a79a-4498-bd82-c06c607b176e; _sp_id.3725=b96eefdc3adfd036.1557229358.3.1557891502.1557831911'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/nypost/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        pg = 1
        while pg < 3:
            url_dic = {
                "news":
                'https://nypost.com/news/page/{}/'.format(pg),
                "metro":
                'https://nypost.com/metro/page/{}/'.format(pg),
                "pagesix":
                'https://pagesix.com/page/{}/'.format(pg),
                "basketball":
                'https://nypost.com/basketball/page/{}/'.format(pg),
                "baseball":
                'https://nypost.com/baseball/page/{}/'.format(pg),
                "football":
                'https://nypost.com/football/page/{}/'.format(pg),
                "college":
                'https://nypost.com/college/page/{}/'.format(pg),
                "hockey":
                'https://nypost.com/hockey/page/{}/'.format(pg),
                "business":
                'https://nypost.com/business/page/{}/'.format(pg),
                "opinion":
                'https://nypost.com/opinion/page/{}/'.format(pg),
                "entertainment":
                'https://nypost.com/entertainment/page/{}/'.format(pg),
                "fashion":
                'https://nypost.com/fashion/page/{}/'.format(pg),
                "living":
                'https://nypost.com/living/page/{}/'.format(pg),
                "tech":
                'https://nypost.com/tech/page/{}/'.format(pg),
            }
            for kw in url_dic:
                print('keyword:\t', kw)
                self.parsing_list_page_url(list_url=url_dic[kw])
            pg += 1

    def parsing_list_page_url(self, list_url):
        res = requests.get(url=list_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        time.sleep(random.uniform(2, 5))
        details_urls_list = html.xpath('//h3[@class="entry-heading"]//a/@href')
        title_list = html.xpath('//h3[@class="entry-heading"]//a/text()')
        releaseTime_list = html.xpath('//div[@class="entry-meta"]//p//text()')
        for i in range(len(details_urls_list)):
            sourceUrl = details_urls_list[i]
            title = title_list[i]
            releaseTime = releaseTime_list[i]
            source_headers = {
                'sourceUrl': sourceUrl,
                'title': title,
                'releaseTime': releaseTime
            }
            self.parsing_details_page_url(source_headers=source_headers, )

    def parsing_details_page_url(self, source_headers):
        status = self.filter.filter_data(
            details_url=source_headers['sourceUrl'])
        if status:
            print('Data already exists!')
        else:
            res = requests.get(url=source_headers['sourceUrl'],
                               headers=self.headers,
                               cookies=self.cookies).text
            html = etree.HTML(res)
            time.sleep(random.uniform(1, 3))
            source = int(10)
            sourceUrl = source_headers['sourceUrl']
            jobId = time.time()
            title = source_headers['title']
            releaseTime = source_headers['releaseTime']
            authorName = ''.join(html.xpath('//p[@class="byline"]//a/text()'))
            content = self.parsing_news_content(content_html=res,
                                                html_obj=html,
                                                newspaper=True)
            img = self.download_img(html_obj=html)
            if img is None or img == '' or content is None or content == '':
                pass
            else:
                data = {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }
                print('data:\n', data)
                self.save.save_data(data=data, news='newYorkPost')

    def parsing_news_content(self,
                             content_html=None,
                             html_obj=None,
                             newspaper=False):
        if newspaper:
            text = fulltext(content_html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath(
                '//div[@id="news-content"]//p/text()')
            content = '<p>'.join([
                i.replace("\n", '').strip() for i in content_list
            ]).replace("<p><p>", '<p>')
        return content

    def download_img(self, html_obj):
        pic_list_1 = html_obj.xpath('//div[@class="featured-image"]/img/@src')
        pic_list_2 = html_obj.xpath(
            '//div[@class="article-header"]//img/@data-srcset')
        pic_url_list = pic_list_1 + pic_list_2
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_url_list == []:
            return None
        else:
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url, r'%s.jpg' % (self.downloadPath + self.picPath +
                                          str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' %
                    (self.picPath + str(img_id) + "-" + str(index)))
                index += 1
            img = ','.join(img_list)
            return img
Code example #29
File: uproxx.py  Project: zhangpeng0v0/news
class UPROXX_News():
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        self.cookies = {
            'cookie':
            '_ga=GA1.2.1916480018.1557387143; _omappvp=NXTUG1O09XwizTEVPHY0CDatCFaa7zmyENXMZ3yBzNBfpRrUJyJSjzawWbNOtmCk3a0M6l51v1hv01nhoAdHqIQLyxntcGlZ; __gads=ID=967d7ff68a5a2656:T=1557387149:S=ALNI_MZFY5Q_tfI8WS1_30SK817ySI14RQ; _cb_ls=1; _cb=BvCRDMN1EZ-CKUJwA; _scid=12f16568-c9b2-4331-9513-626f26e7aac6; _fbp=fb.1.1558321180103.1335358405; __qca=P0-305115545-1558321179420; _chartbeat2=.1558321174581.1558322228287.1.w0ijvCb8zgbDJfkouB1YhL9BM0Wu2.15; _sctr=1|1559059200000; _gid=GA1.2.676333276.1559707655; _cmpQcif3pcsupported=1; _parsely_visitor={%22id%22:%22f94909f1-8e1d-499d-8590-04e058a8acdf%22%2C%22session_count%22:4%2C%22last_session_ts%22:1559707963140}; _parsely_slot_click={%22url%22:%22https://uproxx.com/dimemag/demarcus-cousins-warriors-game-2-nba-finals-passing-analysis-videos/%22%2C%22x%22:1163%2C%22y%22:0%2C%22xpath%22:%22//*[@id=%5C%22menu-item-1560569%5C%22]/a[1]%22%2C%22href%22:%22https://uproxx.com/news%22}; _threds=1; _thredb=uproxx.76a113a16f1e45e5bf36b23bf05e76a6.1558321178020.1559712981550.1559713181162.30.6; _gat_auPassiveTagger=1; _gat=1'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/uproxx/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        # Walk listing pages 1-9 of the wovenis home feed, one request per page.
        for pg in range(1, 10):
            start_url = 'https://uproxx.com/wp-json/wovenis/v1/home/{}?offset=34'.format(
                pg)
            self.parsing_news_list_page(url=start_url)

    def parsing_news_list_page(self, url):
        res = requests.get(url=url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        js = json.loads(res)
        html = js['html']
        html_obj = etree.HTML(html)
        url_list = html_obj.xpath('//h2/a/@href')
        for i in url_list:
            status = self.filter.filter_data(details_url=i)
            if status:
                print('Data already exists!')
            else:
                try:
                    self.parsing_details_page(details_url=i)
                except Exception:
                    # Skip articles that fail to parse instead of aborting the crawl.
                    pass

    def parsing_details_page(self, details_url):
        res = requests.get(url=details_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 2))
        html = etree.HTML(res)
        source = int(13)
        jobId = time.time()
        sourceUrl = details_url
        title = ''.join(html.xpath('//div[@class="post-top"]//h1//text()'))
        authorName = html.xpath('//span[@class="authorname"]//text()')[0]
        releaseTime = html.xpath(
            '//span[@class="published-date uproxx-the-date"]//text()')[0]
        content = self.analysis_news_content(html=res)
        img = self.analysis_news_img(html_obj=html)
        # Save only when both an image and body text were extracted.
        if img and content:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='UPROXX')

    def analysis_news_content(self, html):
        text = fulltext(html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content

    def analysis_news_img(self, html_obj):
        pic_url_list = html_obj.xpath('//div[@class="ug_page"]//img/@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_url_list == []:
            return None
        else:
            try:
                for pic_url in pic_url_list[:17]:
                    response = requests.get(pic_url)
                    image = Image.open(BytesIO(response.content))
                    image.save(r'%s.jpg' % (self.downloadPath + self.picPath +
                                            str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' %
                        (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
            except Exception:
                # Treat any download or decoding failure as "no image".
                return None
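
# A minimal usage sketch, assuming the scraper is launched as a standalone
# script; the original repository may invoke or schedule it differently.
if __name__ == '__main__':
    UPROXX_News().run()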
Code example #30
0
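# Imports this snippet appears to rely on, inferred from the calls it makes;
# the module path for the project-local Filter_Data / Save_Data helpers is an
# assumption and may differ in the actual repository.
import random
import time
import urllib.request
import uuid

import requests
from lxml import etree
from newspaper import fulltext

from filter_data import Filter_Data  # hypothetical module path
from save_data import Save_Data  # hypothetical module path
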
class CBS_News():
    def __init__(self):
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        self.cookies = {
            'cookies':
            'fly_device=desktop; fly_geo={"countryCode": "cn"}; CBS_INTERNAL=0; _cb_ls=1; _cb=DrObeWDJQRFdCPmQx1; optimizelyEndUserId=oeu1556274100628r0.4116041118910556; __gads=ID=d68306632b854d8c:T=1556274103:S=ALNI_MYpAOeaoN_TEKi9ErEphorJuu4FxA; aam_uuid=38178500434044041890375836043549172921; _v__chartbeat3=DSbaGWCHXxS0C6XCeZ; first_page_today=false; cbsnews_ad=%7B%22type%22%3A%22gpt%22%2C%22region%22%3A%22aw%22%2C%22session%22%3A%22a%22%2C%22subSession%22%3A%223%22%7D; AMCVS_10D31225525FF5790A490D4D%40AdobeOrg=1; s_cc=true; OX_plg=pm; fly_vid=1a29bea6-1a13-4100-a305-ffa9b02166d3; pmtimesig=[[1556347239934,0],[1556350240525,3000591],[1556372772902,22532377]]; s_vnum=1558866104445%26vn%3D10; s_invisit=true; s_lv_undefined_s=Less%20than%201%20day; AMCV_10D31225525FF5790A490D4D%40AdobeOrg=1406116232%7CMCMID%7C37954619966530193010387509759393309121%7CMCAAMLH-1557023341%7C11%7CMCAAMB-1557023341%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1556425741s%7CNONE%7CvVersion%7C2.5.0; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; AAMC_cbsi_0=REGION%7C11%7CAMSYNCSOP%7C%7CAMSYNCS%7C; _cb_svref=null; _t_tests=eyJMdFRUYmdVZHBDcHBKIjp7ImNob3NlblZhcmlhbnQiOiJCIiwic3BlY2lmaWNMb2NhdGlvbiI6WyJEZlhyTVYiXX0sImxpZnRfZXhwIjoibSJ9; cbsn_device=desktop; muxData=mux_viewer_id=a3de65c6-88bd-4042-a748-fb385d2ada3d&msn=0.5261598146217972&sid=11df9f3c-9e4d-47e4-9786-2de0583451e8&sst=1556418792060&sex=1556421954813; GED_PLAYLIST_ACTIVITY=W3sidSI6ImdDTUIiLCJ0c2wiOjE1NTY0MjA0NTUsIm52IjoxLCJ1cHQiOjE1NTY0MjAxNDIsImx0IjoxNTU2NDIwNDU1fV0.; s_sq=%5B%5BB%5D%5D; prevPageType=topic_list; prevPageName=cbsnews:/latest/us/5/; s_getNewRepeat=1556420875652-Repeat; s_lv_undefined=1556420875654; utag_main=v_id:016a592a36a1009f5e955a97097003079001807100bd0$_sn:10$_ss:0$_st:1556422675588$vapi_domain:cbsnews.com$dc_visit:10$_pn:38%3Bexp-session$ses_id:1556418538777%3Bexp-session$dc_event:30%3Bexp-session$dc_region:eu-central-1%3Bexp-session; _chartbeat2=.1556274100027.1556420876067.111.atSntCpXEouDM4RkLBcjI23BVm-lP.40; s_ptc=%2Flatest%2Fus%2F5%2F%5E%5E0.00%5E%5E0.01%5E%5E0.28%5E%5E0.52%5E%5E0.63%5E%5E0.44%5E%5E5.08%5E%5E0.01%5E%5E6.59; RT="sl=38&ss=1556418537489&tt=40674&obo=1&sh=1556420880100%3D38%3A1%3A40674%2C1556420718464%3D37%3A1%3A34088%2C1556420455825%3D36%3A1%3A31715%2C1556420142482%3D35%3A1%3A30988%2C1556420128526%3D34%3A1%3A30943&dm=cbsnews.com&si=91b57407-760b-481b-87e3-bcff31d166db&bcn=%2F%2F173e2514.akstat.io%2F&ld=1556420880100&r=https%3A%2F%2Fwww.cbsnews.com%2Flatest%2Fus%2F5%2F&ul=1556420983930"'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/cbs_news/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        pg = 1
        while pg < 4:
            health = 'https://www.cbsnews.com/latest/health/{}/'.format(pg)
            world = 'https://www.cbsnews.com/latest/world/{}/'.format(pg)
            crime = 'https://www.cbsnews.com/latest/crime/{}/'.format(pg)
            entertainment = 'https://www.cbsnews.com/latest/entertainment/{}/'.format(
                pg)
            science = 'https://www.cbsnews.com/latest/science/{}/'.format(pg)
            technology = 'https://www.cbsnews.com/latest/technology/{}/'.format(
                pg)

            self.parsing_health_news_list_page(start_url=health)
            self.parsing_word_news_list_page(start_url=world)
            self.parsing_crime_news_list_page(start_url=crime)
            self.parsing_entertainment_news_list_page(start_url=entertainment)
            self.parsing_science_news_list_page(start_url=science)
            self.parsing_technology_news_list_page(start_url=technology)
            pg += 1

    def parsing_health_news_list_page(self, start_url):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        list_page_url = html.xpath(
            '//section[@id="component-health"]//div[@class="component__item-wrapper"]//article//a/@href'
        )
        thumbnail_img = html.xpath(
            '//section[@id="component-health"]//div[@class="component__item-wrapper"]//span[@class="img item__thumb item__thumb--crop-0"]//img/@src'
        )
        for i in range(len(list_page_url)):
            data = self.parsing_details_page(details_url=list_page_url[i],
                                             thumbnail_img=thumbnail_img[i])
            if data is None:
                pass
            else:
                print('health_data\n', data)
                self.save.save_data(data=data, news='cbs')

    def parsing_word_news_list_page(self, start_url):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        list_page_url = html.xpath(
            '//section[@id="component-world"]//div[@class="component__item-wrapper"]//article//a/@href'
        )
        thumbnail_img = html.xpath(
            '//section[@id="component-world"]//div[@class="component__item-wrapper"]//span[@class="img item__thumb item__thumb--crop-0"]//img/@src'
        )
        for i in range(len(list_page_url)):
            data = self.parsing_details_page(details_url=list_page_url[i],
                                             thumbnail_img=thumbnail_img[i])
            if data is None:
                pass
            else:
                print('word_data\n', data)
                self.save.save_data(data=data, news='cbs')

    def parsing_crime_news_list_page(self, start_url):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        list_page_url = html.xpath(
            '//section[@id="component-crime"]//div[@class="component__item-wrapper"]//article//a/@href'
        )
        thumbnail_img = html.xpath(
            '//section[@id="component-crime"]//div[@class="component__item-wrapper"]//span[@class="img item__thumb item__thumb--crop-0"]//img/@src'
        )
        for i in range(len(list_page_url)):
            data = self.parsing_details_page(details_url=list_page_url[i],
                                             thumbnail_img=thumbnail_img[i])
            if data is None:
                pass
            else:
                print('crime_data\n', data)
                self.save.save_data(data=data, news='cbs')

    def parsing_entertainment_news_list_page(self, start_url):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        list_page_url = html.xpath(
            '//section[@id="component-entertainment"]//div[@class="component__item-wrapper"]//article//a/@href'
        )
        thumbnail_img = html.xpath(
            '//section[@id="component-entertainment"]//div[@class="component__item-wrapper"]//span[@class="img item__thumb item__thumb--crop-0"]//img/@src'
        )
        for i in range(len(list_page_url)):
            data = self.parsing_details_page(details_url=list_page_url[i],
                                             thumbnail_img=thumbnail_img[i])
            if data is None:
                pass
            else:
                print('entertainment_data\n', data)
                self.save.save_data(data=data, news='cbs')

    def parsing_science_news_list_page(self, start_url):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        list_page_url = html.xpath(
            '//section[@id="component-science"]//div[@class="component__item-wrapper"]//article//a/@href'
        )
        thumbnail_img = html.xpath(
            '//section[@id="component-science"]//div[@class="component__item-wrapper"]//span[@class="img item__thumb item__thumb--crop-0"]//img/@src'
        )
        for i in range(len(list_page_url)):
            data = self.parsing_details_page(details_url=list_page_url[i],
                                             thumbnail_img=thumbnail_img[i])
            if data is None:
                pass
            else:
                print('science_data\n', data)
                self.save.save_data(data=data, news='cbs')

    def parsing_technology_news_list_page(self, start_url):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        list_page_url = html.xpath(
            '//section[@id="component-technology"]//div[@class="component__item-wrapper"]//article//a/@href'
        )
        thumbnail_img = html.xpath(
            '//section[@id="component-technology"]//div[@class="component__item-wrapper"]//span[@class="img item__thumb item__thumb--crop-0"]//img/@src'
        )
        for i in range(len(list_page_url)):
            data = self.parsing_details_page(details_url=list_page_url[i],
                                             thumbnail_img=thumbnail_img[i])
            if data is None:
                pass
            else:
                print('technology_data\n', data)
                self.save.save_data(data=data, news='cbs')
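
    # A possible consolidation (illustrative, not in the original file): the six
    # category parsers above are identical except for the section id in their
    # XPaths, so a single parametrized helper could replace them. The method and
    # argument names here are assumptions; zip() also guards against the URL and
    # thumbnail lists differing in length.
    def parsing_category_news_list_page(self, start_url, category):
        res = requests.get(start_url,
                           headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        html = etree.HTML(res)
        wrapper = ('//section[@id="component-{}"]'
                   '//div[@class="component__item-wrapper"]').format(category)
        list_page_url = html.xpath(wrapper + '//article//a/@href')
        thumbnail_img = html.xpath(
            wrapper +
            '//span[@class="img item__thumb item__thumb--crop-0"]//img/@src')
        for url, thumb in zip(list_page_url, thumbnail_img):
            data = self.parsing_details_page(details_url=url,
                                             thumbnail_img=thumb)
            if data is not None:
                print('{}_data\n'.format(category), data)
                self.save.save_data(data=data, news='cbs')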

    def parsing_details_page(self, details_url, thumbnail_img):
        result = self.filter.filter_data(details_url=details_url)
        if result:
            print('Data already exists!')
        else:
            details_res = requests.get(details_url,
                                       headers=self.headers,
                                       cookies=self.cookies).text
            time.sleep(random.uniform(1, 3))
            html = etree.HTML(details_res)
            source = int(7)
            sourceUrl = details_url
            jobId = time.time()
            title = ''.join(html.xpath('//h1[@class="content__title"]/text()'))
            text = fulltext(details_res).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
            author = html.xpath(
                '//p[@class="content__meta content__meta-byline"]/text()')
            authorName = ''.join(
                [i.replace("\n", '<p>').strip() for i in author])
            releaseTimeList = html.xpath(
                '//p[@class="content__meta content__meta-timestamp"]/time/text()'
            )
            releaseTime = ''.join(
                [i.replace("\n", '<p>').strip() for i in releaseTimeList])
            pic_url_list = html.xpath(
                '//span[@class="img embed__content"]//img/@src')
            img = self.download_pic(pic_url_list=pic_url_list,
                                    thumbnail_img=thumbnail_img)
            # Return a record only when an image, body text and title were all extracted.
            if img and content and title:
                return {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }

    def download_pic(self, pic_url_list, thumbnail_img):
        try:
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            if pic_url_list == []:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img = r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                   str(index))
                return img
            else:
                for pic_url in pic_url_list[:17]:
                    urllib.request.urlretrieve(
                        pic_url,
                        r'%s.jpg' % (self.downloadPath + self.picPath +
                                     str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' %
                        (self.picPath + str(img_id) + "-" + str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
        except Exception:
            # Any failed download means the article is treated as having no image.
            return None
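
# A minimal usage sketch, assuming standalone execution; the surrounding
# repository may wire this class up differently.
if __name__ == '__main__':
    CBS_News().run()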