def __init__(self):
    """Configure HTTP identity and storage locations for the AP News crawler."""
    # Impersonate a desktop Chrome browser so the site serves its full HTML.
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    self.headers = {'user-agent': ua}
    # Raw cookie blob captured from a live browser session; these values
    # expire, so refresh them when the site starts rejecting requests.
    self.cookies = {'cookie': '_cb_ls=1; _cb=ChGdwsejPcBwqK1A; _ga=GA1.2.1067424464.1556266698; __gads=ID=b2804ef9280ce726:T=1556266708:S=ALNI_MbsZp6KMsLTd9MAhzM98UpWqF4sEQ; __qca=P0-112096547-1556266838413; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; GED_PLAYLIST_ACTIVITY=W3sidSI6Ilp4Q0YiLCJ0c2wiOjE1NTY2MTc5NjcsIm52IjowLCJ1cHQiOjE1NTY2MTc5NjAsImx0IjoxNTU2NjE3OTYwfV0.; _gid=GA1.2.1304411157.1557027854; _cb_svref=null; OptanonConsent=landingPath=NotLandingPage&datestamp=Sun+May+05+2019+11%3A44%3A56+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=4.1.0&EU=false&groups=0_140011%3A1%2C1%3A1%2C0_140010%3A1%2C2%3A1%2C3%3A1%2C4%3A1%2C0_140046%3A1%2C0_140042%3A1%2C0_140038%3A1%2C0_140034%3A1%2C0_140055%3A1%2C0_140051%3A1%2C0_140047%3A1%2C0_140043%3A1%2C0_140039%3A1%2C0_140035%3A1%2C0_140031%3A1%2C0_140052%3A1%2C0_140048%3A1%2C0_140044%3A1%2C0_140040%3A1%2C0_140036%3A1%2C0_140032%3A1%2C0_140053%3A1%2C0_140049%3A1%2C0_140045%3A1%2C0_140041%3A1%2C0_140037%3A1%2C0_140033%3A1%2C0_140054%3A1%2C0_140050%3A1%2C101%3A1%2C102%3A1%2C103%3A1%2C104%3A1%2C105%3A1%2C106%3A1%2C107%3A1%2C108%3A1%2C109%3A1%2C110%3A1%2C111%3A1%2C112%3A1%2C113%3A1%2C114%3A1%2C115%3A1%2C116%3A1%2C117%3A1%2C118%3A1%2C119%3A1%2C120%3A1%2C121%3A1%2C122%3A1%2C123%3A1%2C124%3A1%2C125%3A1%2C126%3A1%2C127%3A1%2C128%3A1%2C129%3A1%2C130%3A1%2C131%3A1%2C132%3A1%2C133%3A1%2C134%3A1%2C135%3A1%2C136%3A1%2C137%3A1%2C138%3A1%2C139%3A1%2C140%3A1%2C141%3A1%2C142%3A1%2C143%3A1%2C144%3A1%2C145%3A1%2C146%3A1%2C147%3A1%2C148%3A1%2C149%3A1%2C150%3A1%2C151%3A1%2C152%3A1%2C153%3A1%2C154%3A1%2C155%3A1&AwaitingReconsent=false; _tb_sess_r=; _tb_t_ppg=https%3A//apnews.com/245117b7dafd4790ba3d51db06cf345a; _gat=1; _chartbeat2=.1556266696382.1557028669628.1111100001.Vfd8vwJvnJujXq7Dq7JmkgXZfl.4'}
    self.downloadPath = '/data/crawler'   # root directory for downloaded assets
    self.picPath = '/ap_news/picture/'    # per-site image sub-folder
    self.filter = Filter_Data()           # de-duplication helper
    self.save = Save_Data()               # persistence helper
def __init__(self):
    """Set up request identity and storage paths for this crawler."""
    # Desktop Chrome user-agent string.
    browser_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    self.headers = {'user-agent': browser_ua}
    # NOTE(review): these cookies look like a medium.com session while
    # picPath says '/huffpost/picture/' — confirm which site this targets.
    # Session cookies expire; refresh when requests start failing.
    self.cookies = {'cookie': '__cfduid=d6ba6448200002747444269a19593dbdd1555908016; __cfruid=985eba5fa2a449247bfd0598c1c1c5ec968a9416-1558490711; _ga=GA1.2.1064338879.1558490714; _gid=GA1.2.136804405.1558490714; lightstep_guid/medium-web=8f0cd65b0ef4abdb; lightstep_session_id=fcac5cf910466bc4; pr=1; tz=-480; uid=3314454e53ae; sid=1:4N4F93p0H1gPvFCIGldZUdIdQeFiifNF6stzqPFyBikCsGpjcmnIyu/NNWwIVVTx; xsrf=89TsRPcZaZKu; lightstep_guid/lite-web=7d9b16045b97b840; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://medium.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1558512703778%2C%22slts%22:1558503751909}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:3%2C%22last_session_ts%22:1558512703778}; sz=1905'}
    self.downloadPath = '/data/crawler'   # download root
    self.picPath = '/huffpost/picture/'   # image sub-folder
    self.filter = Filter_Data()           # duplicate-URL filter
    self.save = Save_Data()               # storage backend
def __init__(self):
    """Initialise headers and storage paths for the SmartNews crawler."""
    # Plain desktop Chrome identity; this site needs no session cookies.
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    self.downloadPath = '/data/crawler'     # download root
    self.picPath = '/smartNews/picture/'    # image sub-folder
    self.filter = Filter_Data()             # duplicate-URL filter
    self.save = Save_Data()                 # storage backend
def __init__(self):
    """Configure the Fox News crawler: NewsAPI client, categories, HTTP identity."""
    # NOTE(review): API key is hardcoded — consider moving it to config/env.
    self.news_api = NewsApiClient(api_key='f04f7a8db32841299d4a7fae723e61b2')
    self.t = time.time()
    # Today's date, used as the query cut-off.
    self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
    # NOTE(review): 'word' looks like a possible typo for 'world' — confirm
    # against the site's category slugs before changing.
    self.keyword = ['us', 'word', 'opinion', 'politics', 'entertainment', 'lifestyle', 'health', 'travel', 'autos']
    self.headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # Browser-captured session cookies; refresh when they expire.
    self.cookies = {'cookie': '_cb_ls=1; optimizelyEndUserId=oeu1556269407120r0.4256555044820445; cto_lwid=a3569f8e-fd62-48fd-8cf3-52e3a3d49218; _gcl_au=1.1.1392012605.1556269408; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22cfa5a6d1-cac6-4a48-97ed-e2a25488a94a%22; _ga=GA1.2.353904812.1556269412; _cb=D6-ViRhsUuoBSGama; __gads=ID=0a226a472ca026e8:T=1556269422:S=ALNI_Mb8qEqiRmqgHFem87cBOSEiCTTaJQ; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _scid=47caae3a-e216-48d9-8cdc-3159238a7671; FXN_flk=1; AMCVS_17FC406C5357BA6E0A490D4D%40AdobeOrg=1; _gid=GA1.2.1114110874.1557801782; s_cc=true; _csrf=qWmVWRGxKfzqXCxI9_yuGfZI; s_sq=%5B%5BB%5D%5D; AKA_A2=A; ak_bmsc=3362DC65CD8C5F6FE2F5F2E24D7DD7FE6876060DED3200004C65DA5C0E141B34~pl9V8ncmx0JI/913nUJgfYoKX6Gte64URfMw4gBpTiaQPEzpKVnyOxRIc/NBeHS9HwdJZ+Fd5cB6oDFLpRNLt93qTu4fSjWuP7e+PZea5EArlAr63c0rHI5P+U7hKycyZfvpMt2MSsmqLqtUqZqavEQxBprGj74WIJ0a5ZnH2vSP1CYH+4ijzZPqw/REPx+WlZ+jHCptyFj7C9pjBHstMpWmr4RW6NTHMwyBsckJbiQr0p+5gPNq/FUjz06HN7q/b4; _cb_svref=null; AMCV_17FC406C5357BA6E0A490D4D%40AdobeOrg=2121618341%7CMCIDTS%7C18031%7CMCMID%7C37985443320715041480395091296536963184%7CMCAAMLH-1557842971%7C7%7CMCAAMB-1558421455%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1557823855s%7CNONE%7CMCAID%7CNONE; s_pers=%20s_ppn%3Dfnc%253Aroot%253Aroot%253Achannel%7C1557806491239%3B%20omtr_lv%3D1557816723185%7C1652424723185%3B%20omtr_lv_s%3DLess%2520than%25201%2520day%7C1557818523185%3B%20s_nr%3D1557816723191-Repeat%7C1560408723191%3B; _chartbeat2=.1556269420027.1557816723254.0000000010000001.CY0VlWCO9QUgDFFbO8QLxoyCPj7ho.2; s_sess=%20omtr_evar17%3DD%253Dc17%3B%20s_ppvl%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C22%252C83%252C5886%252C1920%252C925%252C1920%252C1080%252C1%252CL%3B%20SC_LINKS%3D%3B%20s_ppv%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C63%252C96%252C3550%252C1920%252C969%252C1920%252C1080%252C1%252CL%3B; criteo_write_test=ChUIBBINbXlHb29nbGVSdGJJZBgBIAE; bm_sv=8A6F070ED17B9F85AD022D562A830573~oN82OtrVhgL99OXQYjpsFWPKOuwBoUVwy60qge23Kx9pNN2MIe3/AhQZJZ+na42MjDAIyCRuvDS6csM6csNzVnCY/0Ue7dXJIHzFvEjq/KcL+5X57fiZK5b9W/W3g/hw1kSCvVxA/GNO4h9IlDmY6OElMgVSqN2h9kq42m6z+n0='}
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/fox_news/picture/'    # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Prepare HTTP identity and output locations for the BuzzFeed crawler."""
    # Pretend to be desktop Chrome.
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    }
    # Browser-captured cookie blob (time-limited; refresh when rejected).
    self.cookies = {
        'cookie': '_ga=GA1.2.2006098489.1555559856; _fbp=fb.1.1555559860721.1190642659; __qca=P0-464700868-1555559857580; permutive-id=35526ebd-337f-4b00-bf5b-10a6610a85a5; __gads=ID=2cb4be529258fba6:T=1555559912:S=ALNI_MawMBKcEjsbSC3roAOVcCQm5lCB2A; _pdfps=%5B7684%2C13160%2C13164%2C13319%2C13730%2C14474%2C10166%2C12448%2C12449%2C12882%2C13097%2C13214%2C13217%2C13276%2C13278%2C13834%2C14353%2C10748%2C10788%2C13102%2C13144%2C13145%2C13146%2C13147%2C13150%2C13151%2C13157%2C13163%2C13169%2C13667%2C14437%2C14458%2C10224%2C10915%2C13153%2C13675%2C14142%2C13064%2C13216%2C13279%2C14431%2C14432%2C10749%2C10789%2C10906%2C10916%2C10917%2C11655%2C12233%2C12244%2C12679%2C12985%2C13099%2C13101%2C13148%2C13244%2C13741%2C13742%2C14143%2C14479%2C14872%2C15077%2C15128%2C15139%2C10222%2C13100%2C10216%2C%2212244-15-22969%22%2C%2212244-15-22970%22%2C%2212679-5-118997US%22%2C%2212985-5-118497US%22%2C%2213244-5-325997US%22%2C%2213245-5-325997US%22%2C%2213246-5-325997US%22%2C%2213458-15-22969%22%2C%2213458-15-22970%22%2C%2213459-15-22969%22%2C%2213459-15-22970%22%2C%2214229-5-318346US%22%2C%2214351-15-22835%22%2C%2214479-5-325547US%22%2C%2214872-15-22835%22%2C%2214872-15-22814%22%2C%2215063-5-318346US%22%2C%2215063-5-325346US%22%5D; permutive-session=%7B%22session_id%22%3A%2215c3b65a-580a-47aa-ab95-b44076421376%22%2C%22last_updated%22%3A%222019-04-27T02%3A27%3A38.962Z%22%7D; _cmpQcif3pcsupported=1; _gid=GA1.2.13310005.1557196067; _gat=1; sailthru_pageviews=4; sailthru_content=cbe347ea3dd8f028b2a79dd2124b2609d73dc57549ee138bd1d9dedee18e797c3cde4668fc0929097a33767e5b408948300e3683df34cf01dce50805bbb1306ce0bce460f7e70fed288b52d84bd9816499693f0167a253c9d1ba851de3a9d8e9dd7ae6730eff39df6f3b2fee47cae2908e3260668e0361ea9bd2ebb68e2a0591e9ec864cd274cc1d8b3a98016c2bcf1d874e57a78b55d2f981aeb6d2c79bfecc9d43236330abbff1afd96b7ffa626bb4936065bb0196c7181b628021dea483cf13a2f044347925f429d5fbc7008162c9cd736b79ca68d62341101204bca0cca1ff22ee54be7fa316d48db768db05dda4f044956926b209e90497a64953e290f7; sailthru_visitor=a057d87e-b51c-4f1e-9167-d146c2a3a7bc'
    }
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/buzzfeed/picture/'    # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Set HTTP identity and storage locations for the NY Post crawler."""
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    self.headers = {'user-agent': ua}
    # Session cookies recorded from a browser; expected to go stale.
    self.cookies = {
        'cookie': 'optimizelyEndUserId=oeu1555918735298r0.6812295616493853; _ga=GA1.2.501461025.1557229334; __pnahc=0; __tbc=%7Bjzx%7DbOREsfUR6SMcRp5niCu4XyJqGIm9xLbU2svbGCB3e5Y-ZlpcwIXRF_gOx5ssrGMlRCzVeeO-JA50xgthIobqDMJS2og0GbDQCa7bPklPxk1yFokaLXVHvRa0s4J7s817Uqt8s09tJ4GcmUzNoGeVhA; __pat=-14400000; __gads=ID=b83698edc796f48a:T=1557229341:S=ALNI_MZYuxiKvMlIMXV92xfTw1XB6ms7EA; __qca=P0-643878854-1557229359497; _gid=GA1.2.1904385410.1557829332; _ncg_g_id_=9d6dce3e-2d15-45b6-a948-9dbb7fa69171; OX_plg=pm; _ncg_id_=16a959d5d8b-5c91174d-2719-4974-99d8-33e86e4219c2; _pc_morningReportRan=true; _sp_ses.3725=*; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://nypost.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1557886391506%2C%22slts%22:1557829347024}; _parsely_visitor={%22id%22:%2236e3895b-5884-4e1e-b290-0b0a1e631850%22%2C%22session_count%22:3%2C%22last_session_ts%22:1557886391506}; AMP_TOKEN=%24NOT_FOUND; bounceClientVisit2045v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgHYCeEA9iggHQDGFAtkcQKYDu6BIANCAE4wQZStXpMQAXyA; _ncg_sp_ses.64db=*; _gat=1; __idcontext=eyJjb29raWVJRCI6IlpORUtKQzRVUEwzNUhUUTI2QkNJVUIySDZZVUFOTjI1V0E0VzI1WERQSU9RPT09PSIsImRldmljZUlEIjoiWk5FS0pDNFVPVFU1NzVaQTVZSU1LVlNHUlVFU0pOM0JWNEdTWVJHNUM0WEE9PT09IiwiaXYiOiJQR1ZMV1NQWTc2R0pGMkhISUJCVEpBTEc0UT09PT09PSIsInYiOjF9; __pvi=%7B%22id%22%3A%22v-2019-05-15-11-28-33-472-TCTTCzJ3MeatqGej-a5838d1dc51fc02369a5c570d5bb61d6%22%2C%22domain%22%3A%22.nypost.com%22%2C%22time%22%3A1557891488612%7D; __adblocker=false; xbc=%7Bjzx%7DCTyXA66nwH4u0LSnMj_hrMtwYTk54JF59dLs5o_wp3snMXNdvj2Yy6TBtbRxyGxf14_VW1q5TLlW6vo43sH4bt1xlU681XmGmmXaT-SetcMReVqnxTFjI2gW-7RAeJAQFo8mvk88JA2ghePCorbhbWMs02tfzF_-k1Krwk0Vz5I_4BWDD33FM1fohQjjcgYaPM-1rt-sKsCEnjEZlCFDpqiFO54mgbKUB-kFVcHhi-_WjEFJazS2Vtn_ZZJHi-y44g16CXbGiqpHfoDR9DPafHAts-4n-G65fMRtwt9Ml8JaS73yz78cdU_g515IoAaF5TiHkpwV8OOumbfwBrkq2AU3h3dtbnjKZd070tIlyyZdFCbfpjqxaxax2jiN0PitRuCioMt8p4TO3fxq6ok4tA; _ncg_sp_id.64db=d08b0f08-4e58-40a9-8cd3-63efa5ae79b6.1557229345.5.1557891492.1557886389.183b444b-a79a-4498-bd82-c06c607b176e; _sp_id.3725=b96eefdc3adfd036.1557229358.3.1557891502.1557831911'
    }
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/nypost/picture/'      # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Set up HTTP identity and storage paths for the TechCrunch crawler."""
    # Desktop Chrome user-agent.
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    # Browser-session cookie blob (will expire).
    self.cookies = {
        'cookie': 'rxx=293dmvskaws.1i9frk99&v=1; _ga=GA1.2.893461412.1556504122; _fbp=fb.1.1556504132562.1028440015; __pnahc=0; __pat=-25200000; OTH=v=1&d=eyJraWQiOiIwMTY0MGY5MDNhMjRlMWMxZjA5N2ViZGEyZDA5YjE5NmM5ZGUzZWQ5IiwiYWxnIjoiUlMyNTYifQ.eyJjdSI6eyJndWlkIjoiUDJCVVRPR0RVT0VGVERUV0pNTFdKNFlFSDQiLCJzaWQiOiJEYTNsbjdKbG11MmwifX0.HoiBv5OlQvNY2x6q-LJBN-VzgCErT7GTCnODqLLQ8foasqTVUCPVXvwHFniFc7CwCf0n7lmSgfrSycQNevIFSJHZ7M-S9SRQH4FMtu91qykbuvAzAOQZRw_iz_warZWFJtpIys0EVH4Gn9wYqaqLXv-5lO39fuPsqJx9z7X6luQ; BX=e7bndb9eccnhl&b=4&d=lrdDlyNpYELw7nQr45ylAA--&s=8v&i=_g5gVQJJ34nc.9WZ3JGN; GUC=AQEAAQJc0SBdu0IgxgTZ&s=AQAAAJQ2Kk5V&g=XM_RDg; __tbc=%7Bjzx%7DjGAToaZMxJYLoS7N4KRjDaxHalABoj31MSFHkZP0UNxHLBrPMu6clUAaZwsaHnUnQQaMDnEIRO1fDpAMrkMVflCNhUFWsFFB8n1hsUBhKEKL38bZEAUprS1G6wPj4GNM4bchi9l7YPvr6or9wrNMLmWzw2hPXY5j7UVUWDOUH_U; __pcvc={}; _parsely_session={%22sid%22:6%2C%22surl%22:%22https://techcrunch.com/2019/05/04/uber-is-facing-australian-class-action-suit-alleging-unlawful-conduct/%22%2C%22sref%22:%22%22%2C%22sts%22:1558071232131%2C%22slts%22:1557885616789}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:6%2C%22last_session_ts%22:1558071232131}; cmp=t=1558071232&j=0; _gid=GA1.2.1358424281.1558071235; _gat=1; __adblocker=false; xbc=%7Bjzx%7DYW6Rlvft6bPCfQyJ3DedvFReFNeSWzD34uqjUgyftdmRMMeJaQrGxlc0RnHslaNJuW923ovrMyh3fAAIY_x7R_Da15zP9YopEn3Om90NI0T5GRkVz40I1R8zV8ZQB68kBF2YuF_JsLshS1YKLFcyLSN12KbxNP4vrnBqkqtIO2yaJ5LoTRrcAPA64ePs4VtlokVTqGlotnhRSiMBSeplyP6M0a5Lj5rCIn1GIetfFxi-gIZuaMlkdAHSSmrqD1nfLBrQXcHSWrDRR0PGzzVvFjSVEXhIbldyChWDeAkkgN0hgI8KXA304yID8T-gx9UZiwWN897EFpRv3ZNtbg5IqW5GixrDYN1X7y_FdQGe5c4Tlz-figdB5Mbe5Qj2godX23QAk9Y6PbNudCC8Em1tgOzteL0CnIShQ--XvwA9qsvEZSWlAxrGfFmStXYiVaTRc1BM1DSemqPeEIoI_XtXT1h-FOYTaDZfqgflEl3Qb8MlWCowztRcnRznul-OxLIUMkPAraljlm83Bs9Z0ZZTeULOzew-rPTbrZfnXeQjr8OJtrUbNexMaJib654rgmNL7kXPxNmVdB1ZWX5IXEgmiW4XKjZACr0RxZbzXhXFfEN9gPbI7xVJJD8kfmfWoGW_0O6MebIrRbW8xxFPLY90Mw; __pvi=%7B%22id%22%3A%22v-2019-05-17-13-33-55-875-qtmTHL61BOYny2co-1510a80d282f15b71b1e5f4d8bc358ee%22%2C%22domain%22%3A%22.techcrunch.com%22%2C%22time%22%3A1558072171993%7D'
    }
    self.downloadPath = '/data/crawler'      # download root
    self.picPath = '/techcrunch/picture/'    # image sub-folder
    self.filter = Filter_Data()              # duplicate-URL filter
    self.save = Save_Data()                  # storage backend
def __init__(self):
    """Configure HTTP identity and storage paths for the Matador Network crawler."""
    chrome_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    self.headers = {'user-agent': chrome_ua}
    # Browser session cookies; refresh when the site starts rejecting us.
    self.cookies = {
        'cookie': '_ga=GA1.2.1006188425.1558506407; __auc=c07ed86116ade38958b6f215c90; __gads=ID=ebab27bbe751d3d9:T=1558506409:S=ALNI_MbKANVLVZlZmub7wcHXVdRq__9uAQ; _fbp=fb.1.1558506413908.2064431331; cache-primed=1; mn-push-status=8; EU=(null); _gid=GA1.2.549309516.1558921947; __asc=2b5b28f116af808ea5c6cf504f0'
    }
    self.downloadPath = '/data/crawler'            # download root
    self.picPath = '/matador_network/picture/'     # image sub-folder
    self.filter = Filter_Data()                    # duplicate-URL filter
    self.save = Save_Data()                        # storage backend
def __init__(self):
    """Set up HTTP identity and storage paths for the HuffPost crawler."""
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    # Captured browser cookies (time-limited).
    self.cookies = {
        'cookie': 'BX=cj1ovi5ee464n&b=3&s=nk; rxx=aflhe5fyk20.1j3ho4gz&v=1; _fbp=fb.1.1558321313127.2047077689; GUC=AQEBAQFc42Vdw0If_QRY&s=AQAAAB6nBWF3&g=XOIY0w; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _tb_sess_r=https%3A//www.huffpost.com/topic/nsfw%3Fpage%3D1; GED_PLAYLIST_ACTIVITY=W3sidSI6Im9TTWYiLCJ0c2wiOjE1NTgzNDA3MTMsIm52IjoxLCJ1cHQiOjE1NTgzNDA3MDYsImx0IjoxNTU4MzQwNzEzfSx7InUiOiIxSmhlIiwidHNsIjoxNTU4MzQwNjQ5LCJudiI6MSwidXB0IjoxNTU4MzQwNjM5LCJsdCI6MTU1ODM0MDY0OX1d; _tb_t_ppg=https%3A//www.huffpost.com/entry/nobuyoshi-araki-museum-of-sex_n_5a7c8c38e4b0c6726e10b29d'
    }
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/huffpost/picture/'    # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Configure HTTP identity and storage paths for the CBS News crawler."""
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    }
    # NOTE(review): key is 'cookies' here while every sibling crawler uses
    # 'cookie' — confirm how the consuming request code reads this dict
    # before renaming.
    self.cookies = {
        'cookies': 'fly_device=desktop; fly_geo={"countryCode": "cn"}; CBS_INTERNAL=0; _cb_ls=1; _cb=DrObeWDJQRFdCPmQx1; optimizelyEndUserId=oeu1556274100628r0.4116041118910556; __gads=ID=d68306632b854d8c:T=1556274103:S=ALNI_MYpAOeaoN_TEKi9ErEphorJuu4FxA; aam_uuid=38178500434044041890375836043549172921; _v__chartbeat3=DSbaGWCHXxS0C6XCeZ; first_page_today=false; cbsnews_ad=%7B%22type%22%3A%22gpt%22%2C%22region%22%3A%22aw%22%2C%22session%22%3A%22a%22%2C%22subSession%22%3A%223%22%7D; AMCVS_10D31225525FF5790A490D4D%40AdobeOrg=1; s_cc=true; OX_plg=pm; fly_vid=1a29bea6-1a13-4100-a305-ffa9b02166d3; pmtimesig=[[1556347239934,0],[1556350240525,3000591],[1556372772902,22532377]]; s_vnum=1558866104445%26vn%3D10; s_invisit=true; s_lv_undefined_s=Less%20than%201%20day; AMCV_10D31225525FF5790A490D4D%40AdobeOrg=1406116232%7CMCMID%7C37954619966530193010387509759393309121%7CMCAAMLH-1557023341%7C11%7CMCAAMB-1557023341%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1556425741s%7CNONE%7CvVersion%7C2.5.0; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; AAMC_cbsi_0=REGION%7C11%7CAMSYNCSOP%7C%7CAMSYNCS%7C; _cb_svref=null; _t_tests=eyJMdFRUYmdVZHBDcHBKIjp7ImNob3NlblZhcmlhbnQiOiJCIiwic3BlY2lmaWNMb2NhdGlvbiI6WyJEZlhyTVYiXX0sImxpZnRfZXhwIjoibSJ9; cbsn_device=desktop; muxData=mux_viewer_id=a3de65c6-88bd-4042-a748-fb385d2ada3d&msn=0.5261598146217972&sid=11df9f3c-9e4d-47e4-9786-2de0583451e8&sst=1556418792060&sex=1556421954813; GED_PLAYLIST_ACTIVITY=W3sidSI6ImdDTUIiLCJ0c2wiOjE1NTY0MjA0NTUsIm52IjoxLCJ1cHQiOjE1NTY0MjAxNDIsImx0IjoxNTU2NDIwNDU1fV0.; s_sq=%5B%5BB%5D%5D; prevPageType=topic_list; prevPageName=cbsnews:/latest/us/5/; s_getNewRepeat=1556420875652-Repeat; s_lv_undefined=1556420875654; utag_main=v_id:016a592a36a1009f5e955a97097003079001807100bd0$_sn:10$_ss:0$_st:1556422675588$vapi_domain:cbsnews.com$dc_visit:10$_pn:38%3Bexp-session$ses_id:1556418538777%3Bexp-session$dc_event:30%3Bexp-session$dc_region:eu-central-1%3Bexp-session; _chartbeat2=.1556274100027.1556420876067.111.atSntCpXEouDM4RkLBcjI23BVm-lP.40; s_ptc=%2Flatest%2Fus%2F5%2F%5E%5E0.00%5E%5E0.01%5E%5E0.28%5E%5E0.52%5E%5E0.63%5E%5E0.44%5E%5E5.08%5E%5E0.01%5E%5E6.59; RT="sl=38&ss=1556418537489&tt=40674&obo=1&sh=1556420880100%3D38%3A1%3A40674%2C1556420718464%3D37%3A1%3A34088%2C1556420455825%3D36%3A1%3A31715%2C1556420142482%3D35%3A1%3A30988%2C1556420128526%3D34%3A1%3A30943&dm=cbsnews.com&si=91b57407-760b-481b-87e3-bcff31d166db&bcn=%2F%2F173e2514.akstat.io%2F&ld=1556420880100&r=https%3A%2F%2Fwww.cbsnews.com%2Flatest%2Fus%2F5%2F&ul=1556420983930"'
    }
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/cbs_news/picture/'    # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Set HTTP identity and storage locations for the Uproxx crawler."""
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    self.headers = {'user-agent': ua}
    # Browser-session cookies (will go stale).
    self.cookies = {
        'cookie': '_ga=GA1.2.1916480018.1557387143; _omappvp=NXTUG1O09XwizTEVPHY0CDatCFaa7zmyENXMZ3yBzNBfpRrUJyJSjzawWbNOtmCk3a0M6l51v1hv01nhoAdHqIQLyxntcGlZ; __gads=ID=967d7ff68a5a2656:T=1557387149:S=ALNI_MZFY5Q_tfI8WS1_30SK817ySI14RQ; _cb_ls=1; _cb=BvCRDMN1EZ-CKUJwA; _scid=12f16568-c9b2-4331-9513-626f26e7aac6; _fbp=fb.1.1558321180103.1335358405; __qca=P0-305115545-1558321179420; _chartbeat2=.1558321174581.1558322228287.1.w0ijvCb8zgbDJfkouB1YhL9BM0Wu2.15; _sctr=1|1559059200000; _gid=GA1.2.676333276.1559707655; _cmpQcif3pcsupported=1; _parsely_visitor={%22id%22:%22f94909f1-8e1d-499d-8590-04e058a8acdf%22%2C%22session_count%22:4%2C%22last_session_ts%22:1559707963140}; _parsely_slot_click={%22url%22:%22https://uproxx.com/dimemag/demarcus-cousins-warriors-game-2-nba-finals-passing-analysis-videos/%22%2C%22x%22:1163%2C%22y%22:0%2C%22xpath%22:%22//*[@id=%5C%22menu-item-1560569%5C%22]/a[1]%22%2C%22href%22:%22https://uproxx.com/news%22}; _threds=1; _thredb=uproxx.76a113a16f1e45e5bf36b23bf05e76a6.1558321178020.1559712981550.1559713181162.30.6; _gat_auPassiveTagger=1; _gat=1'
    }
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/uproxx/picture/'      # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Prepare HTTP identity, category list and storage paths for the Looper crawler."""
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    # Browser-captured session cookies; refresh when requests start failing.
    self.cookies = {
        'Cookie': 'eu_cookie=1; _ga=GA1.2.716753130.1558322924; __qca=P0-1424062991-1558322923905; __gads=ID=1477ebfc328ade2f:T=1558322965:S=ALNI_MaqMSrLXw4oP8tpYvkTfPLW8rNP8g; OX_ssn=5819416341; _gid=GA1.2.90982873.1559799919; OX_plg=pm; OX_sd=3; looperSessionDepth=3; eu_cookie=1; cuid=5931835b1dcefbdb0a501558923072349_1562391999845; GED_PLAYLIST_ACTIVITY=W3sidSI6IndpbHciLCJ0c2wiOjE1NTk4MDAyOTIsIm52IjoxLCJ1cHQiOjE1NTk4MDAyOTAsImx0IjoxNTU5ODAwMjkxfSx7InUiOiJVYmRFIiwidHNsIjoxNTU5ODAwMDY3LCJudiI6MSwidXB0IjoxNTU5ODAwMDM0LCJsdCI6MTU1OTgwMDA2NX0seyJ1IjoieHl4NiIsInRzbCI6MTU1OTgwMDA1NiwibnYiOjEsInVwdCI6MTU1OTgwMDAzNCwibHQiOjE1NTk4MDAwNTZ9XQ..; _gat=1'
    }
    # Category slugs used to build listing URLs.
    self.key_word = ['news', 'features', 'movies', 'television', 'comics']
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/looper/picture/'      # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Configure HTTP identity, categories and storage paths for the TopBuzz crawler.

    Fix: the user-agent string contained a stray space ('Safari/537. 36');
    every sibling crawler in this file uses 'Safari/537.36'.
    """
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    # NOTE(review): dict key is 'cookies' while most sibling crawlers use
    # 'cookie' — left unchanged; confirm how the request code consumes it.
    self.cookies = {
        'cookies': 'odin_tt=25f29c3c11ab624e32ea123b341f8e8ad3b9254cb1bcb00828ea8bbdf642ee3018a6a10f8ce2d4c3bb22af93a7fbcf4f44f76469931ce1241c8907041d196a1c; tt_webid=6675470162378032646; __tea_sdk__user_unique_id=6675470162378032646; __tea_sdk__ssid=f4cef532-3e68-4425-a4fa-8963bda2fdc3; csrf-token=da1ad8433b7acb6730721e47b072bc7ec710c4e3; csrf-secret=QBi0atkMP4iR2oosQVsHoAxAo7LA2Qzm'
    }
    # Category slugs used when building feed URLs.
    self.keyword = [
        'foryou', 'entertainment', 'sports', 'lifestyle', 'gaming', 'food',
        'tech', 'autos'
    ]
    self.t = time.time()                   # crawl start timestamp
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/topbuzz/picture/'     # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Configure the BBC News crawler: NewsAPI client, HTTP identity, categories."""
    # NOTE(review): hardcoded API key — consider moving it to config/env.
    self.news_api = NewsApiClient(
        api_key='cb7a4ae15a98429890aeedb9a7b460a0')
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    }
    # Browser-captured BBC session cookies (contain signed tokens; expire).
    self.cookies = {
        'cookie': 'ckns_orb_fig_cache={%22ad%22:1%2C%22ap%22:4%2C%22ck%22:0%2C%22eu%22:0%2C%22uk%22:0}; ckns_sa_labels_persist={}; ckns_sscid=7f5aa895-8a47-4928-8632-ae8118032bcb; _cb_ls=1; _cb=CF1-z-CgoNtbBVcmNK; ckns_eds=INS-vt29-666188954:923108334-1556503556; ckns_settings-nonce=FwTiPYjnehKUtBu4zb7oIJ4j; amlbcookie=01; ckns_mvt=8c10379c-2f9b-44e6-a88b-97b0315adccb; ckns_account_experiments=j%3A%7B%22accxp_marketing_opt_in_2%22%3A%22control%22%7D; AWSELB=0FC55D47187ECE9190E70C0A017AC69A844CA844E9727B10D1C45E9E505E11A5757E62A62559CE5ECC76BE5C0D98ACC5FFDFADB0DF8505DDE5C427CC6C744FDB90DA13BB15F2555DE48D9361FEFE0FBEA45595E8C7; ckns_stateless=1; ckns_nonce=nzFw9J2FPDEn17WnPYS0LnRO; ckns_id=eyJhYiI6Im8xOCIsImVwIjp0cnVlLCJldiI6ZmFsc2UsInBzIjoicHVmZjhMV3pjSUlfckQ3RlkwaVo1V0dsM3czbFdBWDQ0TmVXNktKYjdDMCIsInNlcy1leHAiOjE1NTY1MTgzOTIwMDAsImp3dC1leHAiOjE2MTk1ODk0OTIwMDAsInRrbi1leHAiOjE1NTY1MjExMzEwMDAsInJ0a24tZXhwIjoxNjE5NTg5NDkyMDAwfQ; ckns_atkn=eyJ0eXAiOiJKV1QiLCJ6aXAiOiJOT05FIiwiYWxnIjoiSFMyNTYifQ.eyJzdWIiOiIzY2RiOWVkOC01ZjdmLTRlZWEtODYxNS1jMzZmMTdhZjZkMzEiLCJjdHMiOiJPQVVUSDJfU1RBVEVMRVNTX0dSQU5UIiwiYXV0aF9sZXZlbCI6MiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MCIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiYWNjZXNzX3Rva2VuIiwidG9rZW5fdHlwZSI6IkJlYXJlciIsImF1dGhHcmFudElkIjoidlhTaHVESDJRc3BOTTItZ0d3ek4yYlBJczRRIiwiYXVkIjoiQWNjb3VudCIsIm5iZiI6MTU1NjUxNzUzMSwiZ3JhbnRfdHlwZSI6InJlZnJlc2hfdG9rZW4iLCJzY29wZSI6WyJleHBsaWNpdCIsImltcGxpY2l0IiwicGlpIiwidWlkIiwib3BlbmlkIl0sImF1dGhfdGltZSI6MTU1NjUxNzQ5MSwicmVhbG0iOiIvIiwiZXhwIjoxNTU2NTI0NzMxLCJpYXQiOjE1NTY1MTc1MzEsImV4cGlyZXNfaW4iOjcyMDAsImp0aSI6IkZXcExhak13bmYxdUJyRWMtY0xaNnlpTUE1cyJ9.OhaC7wNmB_bALESjcJH8JjKcGRa-WaGkcZGWS0rhAtg; ckns_idtkn=eyJ0eXAiOiJKV1QiLCJraWQiOiJIa2d0WDBJd3RDOStSVGQvOWdYdFN0bk9VaU09IiwiYWxnIjoiUlMyNTYifQ.eyJhdF9oYXNoIjoiNGFfU2tJMWtQaVVZbks2VGlNSm9BdyIsInN1YiI6IjNjZGI5ZWQ4LTVmN2YtNGVlYS04NjE1LWMzNmYxN2FmNmQzMSIsImFiIjoibzE4IiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MSIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiaWRfdG9rZW4iLCJhdWQiOiJBY2NvdW50IiwiYWNyIjoiMCIsImF6cCI6IkFjY291bnQiLCJhdXRoX3RpbWUiOjE1NTY1MTc0OTEsInJlYWxtIjoiLyIsImV4cCI6MTU1NjUyMTEzMSwidG9rZW5UeXBlIjoiSldUVG9rZW4iLCJpYXQiOjE1NTY1MTc1MzF9.LqYjXmcfMMVfB3UupPV8oqez0gojKu-9anW-73WVKXOS5deEbwwYMrTr8JQy85WhwzlZNA5e8eqLPWJ_lAgfjCiw60zdYMxM_x_ZYaHtpPtAXf0SCOD8FlTBKnRZYDqKNkj8F22ctDqPUqrRrN-tDVTqrVMYW38sHqBeXalUGkw-2C24UBlE4DFcDqeqjn0pOFbwuFyQpgwrwp1y6UyUvF3WhuB6GVIkkKNUgYbWpnHTmP9OD8DNM_MH9TLDaC9SoRE5py51CpkZ78Y4rnQUAHeHibjbOwLKQkadVGhFzxr4vzxwJlRj_nrCySmrplgDJ7a9P_raVKfL4JH6UeA1_A; atuserid=%7B%22name%22%3A%22atuserid%22%2C%22val%22%3A%222a12b799-e590-476f-9b23-800e48e162f4%22%2C%22options%22%3A%7B%22end%22%3A%222020-05-30T05%3A59%3A40.931Z%22%2C%22path%22%3A%22%2F%22%7D%7D; _chartbeat2=.1556007561538.1556517584378.1000001.BzGCuqD5hQB-BMGlTiNhxZyCFMP2O.1; _cb_svref=https%3A%2F%2Fwww.bbc.com%2Fnews; ckps_id_ptrt=https%3A%2F%2Fwww.bbc.co.uk%2Fprogrammes%2Fw172wy08d8yw9mq; ecos.dt=1556517629804'
    }
    self.t = time.time()
    # Today's date string used as the query cut-off.
    self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
    # Section names queried against the API.
    self.keyword = [
        'News', 'Health', 'Science', 'Entertainment', 'Technology'
    ]
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/bbc_news/picture/'    # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
def __init__(self):
    """Configure the ABC News crawler: NewsAPI client, sections, HTTP identity."""
    # NOTE(review): hardcoded API key — consider moving it to config/env.
    self.news_api = NewsApiClient(
        api_key='e7d5104fc5c74e259dbe2427b68257fb')
    # Site sections to query.
    self.key_word = [
        'U.S.', 'Lifestyle', 'Technology', 'Entertainment', 'Sports', 'Health'
    ]
    self.t = time.time()
    # Today's date string used as the query cut-off.
    self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    }
    # Browser-captured session cookies; refresh when the site rejects us.
    self.cookies = {
        'cookie': 'cookieMonster=1; _cb_ls=1; SWID=522fc1e1-4ffd-4802-86fa-d7475f8dca57; optimizelyEndUserId=oeu1557122779744r0.05588715173473946; s_vi=[CS]v1|2E67E7728507B2CE-4000011580009D3A[CE]; __gads=ID=b52dc242d5ad893e:T=1557122797:S=ALNI_MaWnnuLLKP88qrPEsPEVuViaoGlJg; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; _cb=Dz5K21B0CX5JDkjMG; _v__chartbeat3=DbUUrKDDGTaEDqauaQ; _cb_svref=null; AkamaiAnalytics_BrowserSessionId=4d41ba72-5ab2-8f33-46aa-f5748aca9647; HTML_VisitIntervalStartTime=1557125661015; s_sess=%20s_cc%3Dtrue%3B%20s_sq%3D%3B; adnum=3undefined; _chartbeat2=.1557122809880.1557125676423.1.DoKBmGC2OW5QDWek5PEERi8oZtYZ.12; HTML_BitRateBucketCsv=0,19083,16715,0,0,0,0,0; HTML_VisitValueCookie=1|1|1|0|35798|35826|0|0|0|0|0|0|NaN; s_pers=%20s_fid%3D22C0AF24132A0778-001FDBB2DA7591AD%7C1620284117936%3B%20s_c20%3D1557125717941%7C1651733717941%3B%20s_c20_s%3DFirst%2520Visit%7C1557127517941%3B; HTML_isPlayingCount=2; GED_PLAYLIST_ACTIVITY=W3sidSI6IlhYU0wiLCJ0c2wiOjE1NTcxMjU3MzgsIm52IjowLCJ1cHQiOjE1NTcxMjU1NDQsImx0IjoxNTU3MTI1NjM3fSx7InUiOiIzbnlXIiwidHNsIjoxNTU3MTI1NzM3LCJudiI6MCwidXB0IjoxNTU3MTI1NTU2LCJsdCI6MTU1NzEyNTYyM30seyJ1IjoiWG81TSIsInRzbCI6MTU1NzEyNTczNywibnYiOjEsInVwdCI6MTU1NzEyNTU4NSwibHQiOjE1NTcxMjU3MzV9XQ..; HTML_VisitCountCookie=1'
    }
    self.downloadPath = '/data/crawler'    # download root
    self.picPath = '/abc_news/picture/'    # image sub-folder
    self.filter = Filter_Data()            # duplicate-URL filter
    self.save = Save_Data()                # storage backend
class Looper_News(object):
    """Crawler for looper.com: walks category listing pages, parses article
    pages, downloads images and persists complete articles.

    Fixes vs. original: bare ``except:`` narrowed to ``except Exception:``,
    empty ``if status: pass / else`` branch inverted to a plain guard,
    redundant ``int(13)`` simplified, manual index counter replaced with
    ``enumerate``.
    """

    def __init__(self):
        # Desktop Chrome identity for the AJAX listing endpoint.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        # Browser-captured session cookies; refresh when requests fail.
        self.cookies = {
            'Cookie': 'eu_cookie=1; _ga=GA1.2.716753130.1558322924; __qca=P0-1424062991-1558322923905; __gads=ID=1477ebfc328ade2f:T=1558322965:S=ALNI_MaqMSrLXw4oP8tpYvkTfPLW8rNP8g; OX_ssn=5819416341; _gid=GA1.2.90982873.1559799919; OX_plg=pm; OX_sd=3; looperSessionDepth=3; eu_cookie=1; cuid=5931835b1dcefbdb0a501558923072349_1562391999845; GED_PLAYLIST_ACTIVITY=W3sidSI6IndpbHciLCJ0c2wiOjE1NTk4MDAyOTIsIm52IjoxLCJ1cHQiOjE1NTk4MDAyOTAsImx0IjoxNTU5ODAwMjkxfSx7InUiOiJVYmRFIiwidHNsIjoxNTU5ODAwMDY3LCJudiI6MSwidXB0IjoxNTU5ODAwMDM0LCJsdCI6MTU1OTgwMDA2NX0seyJ1IjoieHl4NiIsInRzbCI6MTU1OTgwMDA1NiwibnYiOjEsInVwdCI6MTU1OTgwMDAzNCwibHQiOjE1NTk4MDAwNTZ9XQ..; _gat=1'
        }
        # Category slugs used to build listing URLs.
        self.key_word = ['news', 'features', 'movies', 'television', 'comics']
        self.downloadPath = '/data/crawler'    # download root
        self.picPath = '/looper/picture/'      # image sub-folder
        self.filter = Filter_Data()            # duplicate-URL filter
        self.save = Save_Data()                # storage backend

    def run(self):
        """Walk the paginated category feeds (offset steps of 12).

        NOTE(review): with ``pg`` starting at 12 and the bound at 24, only
        offset 12 is ever fetched — confirm the intended page range.
        """
        pg = 12
        while pg < 24:
            for kw in self.key_word:
                url = 'https://www.looper.com/category/{}/?ajax=1&offset={}'.format(
                    kw, pg)
                self.parsing_news_list_page(url=url)
            pg += 12

    def parsing_news_list_page(self, url):
        """Fetch one listing page and crawl every article not seen before."""
        res = requests.get(url=url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))  # throttle to look less bot-like
        html = etree.HTML(res)
        url_list = html.xpath('//h3/a/@href')
        for details_url in url_list:
            # Crawl only URLs the de-dup filter has not recorded yet
            # (original used an empty `if status: pass` branch).
            if not self.filter.filter_data(details_url=details_url):
                self.parsing_details_page(details_url=details_url)

    def parsing_details_page(self, details_url):
        """Parse a single article page and persist it when both text and
        images were extracted successfully."""
        res = requests.get(url=details_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(res)
        source = 13  # numeric id of this news source (was redundant int(13))
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(html.xpath('//h1[@class="title-gallery"]/text()'))
        authorName = ''.join(
            html.xpath('//div[@class="gallery-info"]/a/text()'))
        releaseTime = ''.join(
            html.xpath('//span[@class="news-timestamp"]/text()'))
        content = self.analysis_news_content(html=res, html_obj=html,
                                             newspaper=False)
        img = self.analysis_news_img(html_obj=html)
        # Skip articles that yielded no body text or no images.
        if img and content:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='looper')

    def analysis_news_content(self, html, html_obj, newspaper=False):
        """Return the article body as paragraphs joined with '<p>'.

        html     -- raw HTML string (used when newspaper=True)
        html_obj -- parsed lxml tree (used when newspaper=False)
        """
        if newspaper:
            # Fall back to the newspaper library's generic extractor.
            text = fulltext(html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath('//div[@id="content"]//p//text()')
            content = '<p>'.join([
                i.replace("\n", '').strip() for i in content_list
            ]).replace("<p><p>", '<p>')
        return content

    def analysis_news_img(self, html_obj):
        """Download up to 17 article images and return their relative paths
        comma-joined, or None when any download fails."""
        pic_url_list = html_obj.xpath('//div[@id="content"]//img/@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        img_list = []
        try:
            for index, pic_url in enumerate(pic_url_list[:17], start=1):
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                 str(index)))
            return ','.join(img_list)
        except Exception:
            # Was a bare `except:` which also swallowed SystemExit and
            # KeyboardInterrupt; any failed download aborts the whole set.
            return None
class HuffPost_News():
    """Scraper for huffpost.com 'nsfw' topic listing pages (pages 1-10)."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
        }
        # NOTE(review): hard-coded browser cookies; these expire.
        self.cookies = {
            'cookie': 'BX=cj1ovi5ee464n&b=3&s=nk; rxx=aflhe5fyk20.1j3ho4gz&v=1; _fbp=fb.1.1558321313127.2047077689; GUC=AQEBAQFc42Vdw0If_QRY&s=AQAAAB6nBWF3&g=XOIY0w; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _tb_sess_r=https%3A//www.huffpost.com/topic/nsfw%3Fpage%3D1; GED_PLAYLIST_ACTIVITY=W3sidSI6Im9TTWYiLCJ0c2wiOjE1NTgzNDA3MTMsIm52IjoxLCJ1cHQiOjE1NTgzNDA3MDYsImx0IjoxNTU4MzQwNzEzfSx7InUiOiIxSmhlIiwidHNsIjoxNTU4MzQwNjQ5LCJudiI6MSwidXB0IjoxNTU4MzQwNjM5LCJsdCI6MTU1ODM0MDY0OX1d; _tb_t_ppg=https%3A//www.huffpost.com/entry/nobuyoshi-araki-museum-of-sex_n_5a7c8c38e4b0c6726e10b29d'
        }
        self.downloadPath = '/data/crawler'   # local root for downloaded images
        self.picPath = '/huffpost/picture/'   # relative path stored with the record
        self.filter = Filter_Data()           # duplicate-URL filter (project-defined)
        self.save = Save_Data()               # persistence layer (project-defined)

    def run(self):
        """Walk topic-list pages 1 through 10."""
        pg = 1
        while pg < 11:
            start_url = 'https://www.huffpost.com/topic/nsfw?page={}'.format(
                pg)
            self.parsing_huffpost_news_list(list_url=start_url)
            pg += 1

    def parsing_huffpost_news_list(self, list_url):
        """Fetch one list page and parse each linked article, best-effort per URL."""
        res = requests.get(url=list_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))  # jittered delay between list requests
        html = etree.HTML(res)
        news_list_url = html.xpath('//div[@class="card__content"]/a/@href')
        for details_url in news_list_url:
            try:
                self.parsing_details_page_url(details_url=details_url)
            except Exception:
                # Fix: was a bare `except:`; one broken article should not
                # abort the whole list, but SystemExit/KeyboardInterrupt now
                # propagate.
                pass

    def parsing_details_page_url(self, details_url):
        """Parse a single article page and save it when content and images exist."""
        status = self.filter.filter_data(details_url=details_url)
        if status:
            print('Data already exists!')
        else:
            res = requests.get(url=details_url, headers=self.headers,
                               cookies=self.cookies).text
            time.sleep(random.uniform(1, 3))
            html = etree.HTML(res)
            source = int(12)  # fixed source id for huffpost records
            jobId = time.time()
            sourceUrl = details_url
            title = ''.join(
                html.xpath('//h1[@class="headline__title"]//text()'))
            authorName = self.analysis_author_name(html=html)
            releaseTime = self.analysis_release_time(html=html).replace(
                "\n", '').strip()
            content = self.analysis_new_content(res=res, html=html,
                                                newspaper=False)
            img_list = self.analysis_download_img(html=html)
            img = self.download_pic(img_url_list=img_list)
            if img == '' or img is None or content == '' or content is None:
                pass  # incomplete article: drop without saving
            else:
                data = {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }
                print('data:\n', data)
                self.save.save_data(data=data, news='huffpost')

    def analysis_author_name(self, html):
        """Return the author from the author list, falling back to the author card."""
        authorName = ''.join(
            html.xpath('//div[@class="author-list"]/span/text()'))
        if authorName == '' or authorName is None:
            authorName = ''.join(
                html.xpath('//div[@class="author-card__name"]//text()'))
            return authorName
        else:
            return authorName

    def analysis_release_time(self, html):
        """Return the publish timestamp, trying three known timestamp markups in order."""
        releaseTime_1 = ''.join(
            html.xpath(
                '//div[@class="timestamp timestamp--has-modified-date"]//text()'
            ))
        if releaseTime_1 == '' or releaseTime_1 is None:
            releaseTime_2 = ''.join(
                html.xpath('//div[@class="timestamp"]//text()'))
            if releaseTime_2 == '' or releaseTime_2 is None:
                releaseTime_3 = ''.join(
                    html.xpath(
                        '//div[@class="timestamp timestamp--contributor timestamp--has-modified-date"]//text()'
                    ))
                return releaseTime_3
            else:
                return releaseTime_2
        else:
            return releaseTime_1

    def analysis_new_content(self, res, html, newspaper=False):
        """Extract the article body.

        ``newspaper=True`` uses newspaper's fulltext() on the raw HTML;
        otherwise joins the content-list <p> text nodes with '<p>'.
        """
        if newspaper:
            text = fulltext(res).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
            return content
        else:
            text = html.xpath(
                '//div[@class="content-list-component yr-content-list-text text"]//p//text()'
            )
            content = '<p>'.join([i.replace("\n", '').strip() for i in text
                                  ]).replace("<p><p>",
                                             '<p>').replace("<p>,<p>", ' ')
            return content

    def download_pic(self, img_url_list):
        """Download up to 17 usable images; return comma-joined relative paths.

        Filters out SVGs, tiny 100x100 crops, and the site's
        'default-missing-image' placeholder; returns None when nothing
        usable remains.
        """
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        pic_url_list_1 = [i for i in img_url_list if '.svg' not in i]
        pic_url_list = [j for j in pic_url_list_1 if 'ops=100_100' not in j]
        miss_pic = 'https://img.huffingtonpost.com/asset/default-missing-image.jpg?cache=jio2vozgty&ops=scalefit_970_noupscale'
        if pic_url_list == [] or miss_pic in pic_url_list:
            return None
        else:
            if len(pic_url_list) < 18:
                pic_list = pic_url_list
            else:
                pic_list = pic_url_list[:17]
            for pic_url in pic_list:
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                 str(index)))
                index += 1
            img = ','.join(img_list)
            return img

    def analysis_download_img(self, html):
        """Collect candidate image URLs, trying three known article layouts in order."""
        pic_url_list1 = html.xpath(
            '//div[@class="listicle__slide-content"]/img/@src')
        if pic_url_list1 == []:
            pic_url_list2 = html.xpath(
                '//div[@class="entry__body js-entry-body"]//img/@src')
            if pic_url_list2 == []:
                pic_url_list3 = html.xpath(
                    '//div[@class="collection-item image"]//img/@src')
                return pic_url_list3
            else:
                return pic_url_list2
        else:
            return pic_url_list1
class TopBuzz_News(object):
    """Scraper for topbuzz.com channel feeds via its JSON pgc/feed endpoint."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537. 36'
        }
        # NOTE(review): dict key is 'cookies' (sent as a cookie literally
        # named "cookies"); the sibling scrapers use 'cookie' -- confirm intent.
        self.cookies = {
            'cookies': 'odin_tt=25f29c3c11ab624e32ea123b341f8e8ad3b9254cb1bcb00828ea8bbdf642ee3018a6a10f8ce2d4c3bb22af93a7fbcf4f44f76469931ce1241c8907041d196a1c; tt_webid=6675470162378032646; __tea_sdk__user_unique_id=6675470162378032646; __tea_sdk__ssid=f4cef532-3e68-4425-a4fa-8963bda2fdc3; csrf-token=da1ad8433b7acb6730721e47b072bc7ec710c4e3; csrf-secret=QBi0atkMP4iR2oosQVsHoAxAo7LA2Qzm'
        }
        self.keyword = [
            'foryou', 'entertainment', 'sports', 'lifestyle', 'gaming',
            'food', 'tech', 'autos'
        ]
        self.t = time.time()                  # behot-time cursor for the feed API
        self.downloadPath = '/data/crawler'   # local root for downloaded images
        self.picPath = '/topbuzz/picture/'    # relative path stored with the record
        self.filter = Filter_Data()           # duplicate-URL filter (project-defined)
        self.save = Save_Data()               # persistence layer (project-defined)

    def run(self):
        """Fetch one feed page per channel keyword."""
        for cls in self.keyword:
            print('cls:\t', cls)
            # Fix: the query string contained a mojibake '®ion=us'
            # (an HTML-entity-decoded '&region=us'); restored the literal.
            url = 'https://www.topbuzz.com/pgc/feed?content_space=bd&language=en&region=us&user_id=6675470162378032646' \
                '&channel_name=' + cls + \
                '&classification=all' \
                '&max_behot_time=' + str(self.t)
            self.parsing_topBuzz_list_page(list_url=url)

    def parsing_topBuzz_list_page(self, list_url):
        """Decode the JSON feed and parse every unseen item's detail page."""
        res = requests.get(url=list_url, headers=self.headers,
                           cookies=self.cookies).text
        data = json.loads(res)
        item = data['data']['feed']['items']
        for i in range(len(item)):
            group_id = item[i]['group_id']
            impr_id = item[i]['impr_id']
            user_id = item[i]['author_info']['user_id']
            # Fix: same '®ion' mojibake as in run() -- restored '&region=us'.
            detail_url = 'https://www.topbuzz.com/a/' \
                + group_id + \
                '?app_id=1106' \
                '&gid=' + group_id + \
                '&impr_id=' + impr_id + \
                '&language=en' \
                '&region=us' \
                '&user_id=' + user_id + \
                '&c=sys'
            status = self.filter.filter_data(details_url=detail_url)
            if status:
                print('Data already exists!')
            else:
                self.parsing_details_page(details_url=detail_url)

    def parsing_details_page(self, details_url):
        """Parse a single article page; save it only when content and images were found."""
        time.sleep(random.uniform(1, 3))  # jittered delay between requests
        res = requests.get(url=details_url, headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        source = int(1)  # fixed source id for topbuzz records
        jobId = time.time()
        sourceUrl = details_url
        title = ''.join(html.xpath('//div[@class="title"]/text()'))
        authorName = ''.join(html.xpath('//div[@class="name active"]/text()'))
        releaseTime = ''.join(html.xpath('//div[@class="publishTime"]/text()'))
        content = self.parsing_news_content(res=res, html=html,
                                            newspaper=True)
        img = self.download_img(html=html)
        if img is None or img == '' or content is None or content == '':
            pass  # incomplete article: drop without saving
        else:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='topBuzz')

    def parsing_news_content(self, res, html, newspaper=False):
        """Extract the article body; return None on any parsing failure.

        ``newspaper=True`` runs newspaper's fulltext() on the raw HTML;
        otherwise joins the editor-container <p> text nodes with '<p>'.
        """
        try:
            if newspaper:
                text = fulltext(res).split('\n')
                txt = list(filter(lambda x: x.strip() != '', text))
                content = '<p>'.join(txt)
                return content
            else:
                text = html.xpath(
                    '//div[@class="editor-container"]//p//text()')
                content = '<p>'.join([
                    i.replace("\n", '').strip() for i in text
                ]).replace("<p><p>", '<p>')
                return content
        except Exception:  # fix: was a bare `except:`
            return None

    def download_img(self, html):
        """Download up to 17 protocol-relative images (prefixed with 'https:').

        Returns comma-joined relative paths, or None on failure / no images.
        """
        pic_url_list = html.xpath('//main//img//@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        if pic_url_list == []:
            pass  # no images -> implicit None
        else:
            try:
                # Only protocol-relative URLs (no 'https' in them) are kept,
                # then completed with the scheme below.
                pic_list = [i for i in pic_url_list if 'https' not in i]
                for pic_url in pic_list[:17]:
                    urllib.request.urlretrieve(
                        'https:' + pic_url,
                        r'%s.jpg' % (self.downloadPath + self.picPath +
                                     str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                     str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
            except Exception:  # fix: was a bare `except:`
                return None
class Associated_Press_News(object):
    """Scraper for apnews.com topic hub pages.

    Image URLs are reconstructed from a `mediumIds` array embedded in the
    page's first <script> tag rather than from <img> tags.
    """

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        # NOTE(review): hard-coded browser cookies; these expire.
        self.cookies = {
            'cookie': '_cb_ls=1; _cb=ChGdwsejPcBwqK1A; _ga=GA1.2.1067424464.1556266698; __gads=ID=b2804ef9280ce726:T=1556266708:S=ALNI_MbsZp6KMsLTd9MAhzM98UpWqF4sEQ; __qca=P0-112096547-1556266838413; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; GED_PLAYLIST_ACTIVITY=W3sidSI6Ilp4Q0YiLCJ0c2wiOjE1NTY2MTc5NjcsIm52IjowLCJ1cHQiOjE1NTY2MTc5NjAsImx0IjoxNTU2NjE3OTYwfV0.; _gid=GA1.2.1304411157.1557027854; _cb_svref=null; OptanonConsent=landingPath=NotLandingPage&datestamp=Sun+May+05+2019+11%3A44%3A56+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=4.1.0&EU=false&groups=0_140011%3A1%2C1%3A1%2C0_140010%3A1%2C2%3A1%2C3%3A1%2C4%3A1%2C0_140046%3A1%2C0_140042%3A1%2C0_140038%3A1%2C0_140034%3A1%2C0_140055%3A1%2C0_140051%3A1%2C0_140047%3A1%2C0_140043%3A1%2C0_140039%3A1%2C0_140035%3A1%2C0_140031%3A1%2C0_140052%3A1%2C0_140048%3A1%2C0_140044%3A1%2C0_140040%3A1%2C0_140036%3A1%2C0_140032%3A1%2C0_140053%3A1%2C0_140049%3A1%2C0_140045%3A1%2C0_140041%3A1%2C0_140037%3A1%2C0_140033%3A1%2C0_140054%3A1%2C0_140050%3A1%2C101%3A1%2C102%3A1%2C103%3A1%2C104%3A1%2C105%3A1%2C106%3A1%2C107%3A1%2C108%3A1%2C109%3A1%2C110%3A1%2C111%3A1%2C112%3A1%2C113%3A1%2C114%3A1%2C115%3A1%2C116%3A1%2C117%3A1%2C118%3A1%2C119%3A1%2C120%3A1%2C121%3A1%2C122%3A1%2C123%3A1%2C124%3A1%2C125%3A1%2C126%3A1%2C127%3A1%2C128%3A1%2C129%3A1%2C130%3A1%2C131%3A1%2C132%3A1%2C133%3A1%2C134%3A1%2C135%3A1%2C136%3A1%2C137%3A1%2C138%3A1%2C139%3A1%2C140%3A1%2C141%3A1%2C142%3A1%2C143%3A1%2C144%3A1%2C145%3A1%2C146%3A1%2C147%3A1%2C148%3A1%2C149%3A1%2C150%3A1%2C151%3A1%2C152%3A1%2C153%3A1%2C154%3A1%2C155%3A1&AwaitingReconsent=false; _tb_sess_r=; _tb_t_ppg=https%3A//apnews.com/245117b7dafd4790ba3d51db06cf345a; _gat=1; _chartbeat2=.1556266696382.1557028669628.1111100001.Vfd8vwJvnJujXq7Dq7JmkgXZfl.4'
        }
        self.downloadPath = '/data/crawler'   # local root for downloaded images
        self.picPath = '/ap_news/picture/'    # relative path stored with the record
        self.filter = Filter_Data()           # duplicate-URL filter (project-defined)
        self.save = Save_Data()               # persistence layer (project-defined)

    def run(self):
        """Crawl every configured AP topic hub, best-effort per hub."""
        news_dic = {
            'top': 'https://apnews.com/apf-topnews',
            'sport': 'https://apnews.com/apf-sports',
            'entertainment': 'https://apnews.com/apf-entertainment',
            'travel': 'https://apnews.com/apf-Travel',
            'technology': 'https://apnews.com/apf-technology',
            'lifestyle': 'https://apnews.com/apf-lifestyle',
            'business': 'https://apnews.com/apf-business',
            'usNews': 'https://apnews.com/apf-usnews',
            'health': 'https://apnews.com/apf-Health',
            'science': 'https://apnews.com/apf-science',
            'intlNews': 'https://apnews.com/apf-intlnews',
            'politics': 'https://apnews.com/apf-politics',
        }
        for url in news_dic:
            print('newsUlr:\n', url)
            try:
                # Fix: originally called `ap.parsing_news_list_page(...)`,
                # relying on a module-level instance named `ap`; use self.
                self.parsing_news_list_page(news_start_url=news_dic[url])
            except Exception:  # fix: was a bare `except:`
                pass

    def parsing_news_list_page(self, news_start_url):
        """Fetch one hub page and parse each unseen headline link."""
        list_page_html = requests.get(url=news_start_url,
                                      headers=self.headers,
                                      cookies=self.cookies).text
        time.sleep(random.uniform(2, 5))  # jittered delay between requests
        list_html_obj = etree.HTML(list_page_html)
        list_page_url = list_html_obj.xpath('//a[@class="headline"]/@href')
        # Headline hrefs are site-relative; prepend the host.
        list_url = ['https://apnews.com' + i for i in list_page_url
                    if 'https://apnews.com' not in i]
        for details_url in list_url:
            result = self.filter.filter_data(details_url=details_url)
            if result:
                print('Data already exists!')
            else:
                self.parsing_details_page(details_url=details_url)

    def parsing_details_page(self, details_url):
        """Parse a single article; only gallery-lead articles with images get saved."""
        details_html = requests.get(url=details_url,
                                    headers=self.headers,
                                    cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html_obj = etree.HTML(details_html)
        source = int(4)  # fixed source id for AP records
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(html_obj.xpath('//div[@class="headline"]//h1/text()'))
        authorName = ''.join(html_obj.xpath('//span[@class="byline"]/text()'))
        releaseTime = ''.join(
            html_obj.xpath('//span[@class="Timestamp"]/@data-source'))
        content = self.parsing_news_content(content_html=details_html)
        img_urls = html_obj.xpath(
            '//a[@class="LeadFeature LeadFeature_gallery"]/@href')
        if img_urls == [] or img_urls is None:
            pass  # not a gallery article: skip
        else:
            img = self.download_picture(html=details_html)
            if img is None or img == '':
                pass  # no usable images: skip
            else:
                data = {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }
                print('data:\n', data)
                self.save.save_data(data=data, news='ap')

    def parsing_news_content(self, content_html):
        """Extract the article body via newspaper's fulltext(), joined with '<p>'."""
        text = fulltext(content_html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content

    def download_picture(self, html):
        """Download up to 17 gallery images; return comma-joined relative paths or None."""
        try:
            url_list = self.analysis_pic_url(html=html)
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            for pic_url in url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img_list.append(
                    r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                 str(index)))
                index += 1
            img = ','.join(img_list)
            return img
        except Exception:
            # Fix: was a bare `except: pass` (implicit None); made explicit.
            return None

    def analysis_pic_url(self, html):
        """Build media URLs from the `mediumIds` array in the page's first script tag."""
        html_script = r'<script>(.*?)</script>'
        script = re.findall(html_script, html, re.S | re.M)
        mediumIds_rule = r'mediumIds(.*?)]'
        rule = re.compile(mediumIds_rule)
        # Drop the leading '":[' residue before the id list, then re-wrap
        # as a JSON array.
        result = rule.findall(script[0])[0][3:]
        result = "[" + result + "]"
        js = json.loads(result)
        url_list = []
        for i in js:
            url = ('https://storage.googleapis.com/afs-prod/media/' + i +
                   '/' + '600.jpeg')
            url_list.append(url)
        return url_list
class BBC_News():
    """Scraper for BBC articles discovered through the NewsAPI client."""

    def __init__(self):
        # NOTE(review): API key committed in source -- should move to config.
        self.news_api = NewsApiClient(
            api_key='cb7a4ae15a98429890aeedb9a7b460a0')
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        # NOTE(review): hard-coded browser cookies (incl. signed tokens);
        # these expire.
        self.cookies = {
            'cookie': 'ckns_orb_fig_cache={%22ad%22:1%2C%22ap%22:4%2C%22ck%22:0%2C%22eu%22:0%2C%22uk%22:0}; ckns_sa_labels_persist={}; ckns_sscid=7f5aa895-8a47-4928-8632-ae8118032bcb; _cb_ls=1; _cb=CF1-z-CgoNtbBVcmNK; ckns_eds=INS-vt29-666188954:923108334-1556503556; ckns_settings-nonce=FwTiPYjnehKUtBu4zb7oIJ4j; amlbcookie=01; ckns_mvt=8c10379c-2f9b-44e6-a88b-97b0315adccb; ckns_account_experiments=j%3A%7B%22accxp_marketing_opt_in_2%22%3A%22control%22%7D; AWSELB=0FC55D47187ECE9190E70C0A017AC69A844CA844E9727B10D1C45E9E505E11A5757E62A62559CE5ECC76BE5C0D98ACC5FFDFADB0DF8505DDE5C427CC6C744FDB90DA13BB15F2555DE48D9361FEFE0FBEA45595E8C7; ckns_stateless=1; ckns_nonce=nzFw9J2FPDEn17WnPYS0LnRO; ckns_id=eyJhYiI6Im8xOCIsImVwIjp0cnVlLCJldiI6ZmFsc2UsInBzIjoicHVmZjhMV3pjSUlfckQ3RlkwaVo1V0dsM3czbFdBWDQ0TmVXNktKYjdDMCIsInNlcy1leHAiOjE1NTY1MTgzOTIwMDAsImp3dC1leHAiOjE2MTk1ODk0OTIwMDAsInRrbi1leHAiOjE1NTY1MjExMzEwMDAsInJ0a24tZXhwIjoxNjE5NTg5NDkyMDAwfQ; ckns_atkn=eyJ0eXAiOiJKV1QiLCJ6aXAiOiJOT05FIiwiYWxnIjoiSFMyNTYifQ.eyJzdWIiOiIzY2RiOWVkOC01ZjdmLTRlZWEtODYxNS1jMzZmMTdhZjZkMzEiLCJjdHMiOiJPQVVUSDJfU1RBVEVMRVNTX0dSQU5UIiwiYXV0aF9sZXZlbCI6MiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MCIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiYWNjZXNzX3Rva2VuIiwidG9rZW5fdHlwZSI6IkJlYXJlciIsImF1dGhHcmFudElkIjoidlhTaHVESDJRc3BOTTItZ0d3ek4yYlBJczRRIiwiYXVkIjoiQWNjb3VudCIsIm5iZiI6MTU1NjUxNzUzMSwiZ3JhbnRfdHlwZSI6InJlZnJlc2hfdG9rZW4iLCJzY29wZSI6WyJleHBsaWNpdCIsImltcGxpY2l0IiwicGlpIiwidWlkIiwib3BlbmlkIl0sImF1dGhfdGltZSI6MTU1NjUxNzQ5MSwicmVhbG0iOiIvIiwiZXhwIjoxNTU2NTI0NzMxLCJpYXQiOjE1NTY1MTc1MzEsImV4cGlyZXNfaW4iOjcyMDAsImp0aSI6IkZXcExhak13bmYxdUJyRWMtY0xaNnlpTUE1cyJ9.OhaC7wNmB_bALESjcJH8JjKcGRa-WaGkcZGWS0rhAtg; ckns_idtkn=eyJ0eXAiOiJKV1QiLCJraWQiOiJIa2d0WDBJd3RDOStSVGQvOWdYdFN0bk9VaU09IiwiYWxnIjoiUlMyNTYifQ.eyJhdF9oYXNoIjoiNGFfU2tJMWtQaVVZbks2VGlNSm9BdyIsInN1YiI6IjNjZGI5ZWQ4LTVmN2YtNGVlYS04NjE1LWMzNmYxN2FmNmQzMSIsImFiIjoibzE4IiwiYXVkaXRUcmFja2luZ0lkIjoiNmJmNjBhOTAtMzdiZS00MjE2LWIyOWQtNWI4NDFmZjA2Y2RmLTM2MTkxOTY1MSIsImlzcyI6Imh0dHBzOi8vYWNjZXNzLmFwaS5iYmMuY29tL2JiY2lkdjUvb2F1dGgyIiwidG9rZW5OYW1lIjoiaWRfdG9rZW4iLCJhdWQiOiJBY2NvdW50IiwiYWNyIjoiMCIsImF6cCI6IkFjY291bnQiLCJhdXRoX3RpbWUiOjE1NTY1MTc0OTEsInJlYWxtIjoiLyIsImV4cCI6MTU1NjUyMTEzMSwidG9rZW5UeXBlIjoiSldUVG9rZW4iLCJpYXQiOjE1NTY1MTc1MzF9.LqYjXmcfMMVfB3UupPV8oqez0gojKu-9anW-73WVKXOS5deEbwwYMrTr8JQy85WhwzlZNA5e8eqLPWJ_lAgfjCiw60zdYMxM_x_ZYaHtpPtAXf0SCOD8FlTBKnRZYDqKNkj8F22ctDqPUqrRrN-tDVTqrVMYW38sHqBeXalUGkw-2C24UBlE4DFcDqeqjn0pOFbwuFyQpgwrwp1y6UyUvF3WhuB6GVIkkKNUgYbWpnHTmP9OD8DNM_MH9TLDaC9SoRE5py51CpkZ78Y4rnQUAHeHibjbOwLKQkadVGhFzxr4vzxwJlRj_nrCySmrplgDJ7a9P_raVKfL4JH6UeA1_A; atuserid=%7B%22name%22%3A%22atuserid%22%2C%22val%22%3A%222a12b799-e590-476f-9b23-800e48e162f4%22%2C%22options%22%3A%7B%22end%22%3A%222020-05-30T05%3A59%3A40.931Z%22%2C%22path%22%3A%22%2F%22%7D%7D; _chartbeat2=.1556007561538.1556517584378.1000001.BzGCuqD5hQB-BMGlTiNhxZyCFMP2O.1; _cb_svref=https%3A%2F%2Fwww.bbc.com%2Fnews; ckps_id_ptrt=https%3A%2F%2Fwww.bbc.co.uk%2Fprogrammes%2Fw172wy08d8yw9mq; ecos.dt=1556517629804'
        }
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.keyword = [
            'News', 'Health', 'Science', 'Entertainment', 'Technology'
        ]
        self.downloadPath = '/data/crawler'   # local root for downloaded images
        self.picPath = '/bbc_news/picture/'   # relative path stored with the record
        self.filter = Filter_Data()           # duplicate-URL filter (project-defined)
        self.save = Save_Data()               # persistence layer (project-defined)

    def run(self):
        # Fix: originally called `bbc.parsing_bbc_news_list()`, relying on a
        # module-level instance named `bbc`; use self.
        self.parsing_bbc_news_list()

    def parsing_bbc_news_list(self):
        """Query NewsAPI for each keyword and parse the returned article list."""
        today = self.point_time
        for kw in self.keyword:
            print('keyword:\n', kw)
            # NOTE(review): the `to` date decrements the last digit of the
            # day string (e.g. '...-10' -> '...-1-1' style artifacts on days
            # ending in 0) -- confirm the intended date window.
            news_list = self.news_api.get_everything(
                q=kw,
                sources='bbc-news',
                domains='bbc.co.uk',
                from_param=today,
                to=today[:-1] + str(int(today[-1]) - 1),
                language='en',
                sort_by='relevancy',
                page_size=100)
            self.parsing_news_list_url(news_list=news_list)

    def parsing_news_list_url(self, news_list):
        """Parse each bbc.co.uk article from a NewsAPI result and save complete ones."""
        articles = news_list['articles']
        for i in range(len(articles)):
            details_url = articles[i]['url']
            if 'www.bbc.co.uk' in details_url:
                result = self.filter.filter_data(details_url=details_url)
                if result:
                    print('Data already exists!')
                else:
                    details_res = requests.get(details_url,
                                               headers=self.headers,
                                               cookies=self.cookies).text
                    time.sleep(random.uniform(1, 5))
                    html_obj = etree.HTML(details_res)
                    source = int(6)  # fixed source id for BBC records
                    sourceUrl = details_url
                    jobId = time.time()
                    authorName = articles[i]['source']['name']
                    releaseTime = articles[i]['publishedAt']
                    title_source = articles[i]['title']
                    title = self.parsing_news_title(html=html_obj,
                                                    title_source=title_source)
                    thumbnail_img = articles[i]['urlToImage']
                    img = self.download_img(html=html_obj,
                                            thumbnail_img=thumbnail_img)
                    content = self.parsing_news_content(
                        content_html=details_res)
                    # Sign-in wall text means we got no real article body.
                    if content == 'Sign in to the BBC, or Register' or content is None or img is None or img == '':
                        pass
                    else:
                        data = {
                            'source': source,
                            'jobId': int(jobId),
                            'sourceUrl': sourceUrl,
                            'title': title,
                            'authorName': authorName,
                            'releaseTime': releaseTime,
                            'content': content,
                            'img': img
                        }
                        print('data:\n', data)
                        self.save.save_data(data=data, news='bbc')

    def parsing_news_title(self, html, title_source):
        """Prefer the on-page headline; fall back to the NewsAPI title."""
        title = ''.join(html.xpath('//h1[@class="story-body__h1"]/text()'))
        if title == '' or title is None:
            return title_source
        else:
            return title

    def parsing_news_content(self, content_html):
        """Extract the article body via newspaper's fulltext(), joined with '<p>'."""
        text = fulltext(content_html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content

    def download_img(self, html, thumbnail_img):
        """Download article images (upscaling 320px variants to 660px).

        Falls back to the NewsAPI thumbnail when the page yields no usable
        image; returns comma-joined relative paths or None on failure.
        """
        try:
            pic_list_1 = html.xpath(
                '//span[@class="image-and-copyright-container"]/img/@src')
            pic_list_2 = html.xpath(
                '//div[@class="js-delayed-image-load"]/@data-src')
            pic_list_3 = [i for i in pic_list_2 if '320' in i]
            pic_list = pic_list_1 + pic_list_3
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            pic_url_list = [i for i in pic_list if 'png' not in i]
            if pic_url_list == []:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img = r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                   str(index))
                return img
            else:
                for pic_url in pic_url_list:
                    if '320' in pic_url:
                        # Swap the 320px rendition for the 660px one.
                        url = pic_url.replace("320", '660')
                        urllib.request.urlretrieve(
                            url,
                            r'%s.jpg' % (self.downloadPath + self.picPath +
                                         str(img_id) + "-" + str(index)))
                        img_list.append(
                            r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                         str(index)))
                        index += 1
                    else:
                        urllib.request.urlretrieve(
                            pic_url,
                            r'%s.jpg' % (self.downloadPath + self.picPath +
                                         str(img_id) + "-" + str(index)))
                        img_list.append(
                            r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                         str(index)))
                        index += 1
                img = ','.join(img_list)
                return img
        except Exception:  # fix: was a bare `except:`
            return None
class ABC_News(object):
    """Scraper for abcnews.go.com articles discovered through the NewsAPI client."""

    def __init__(self):
        # NOTE(review): API key committed in source -- should move to config.
        self.news_api = NewsApiClient(
            api_key='e7d5104fc5c74e259dbe2427b68257fb')
        self.key_word = [
            'U.S.', 'Lifestyle', 'Technology', 'Entertainment', 'Sports',
            'Health'
        ]
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        # NOTE(review): hard-coded browser cookies; these expire.
        self.cookies = {
            'cookie': 'cookieMonster=1; _cb_ls=1; SWID=522fc1e1-4ffd-4802-86fa-d7475f8dca57; optimizelyEndUserId=oeu1557122779744r0.05588715173473946; s_vi=[CS]v1|2E67E7728507B2CE-4000011580009D3A[CE]; __gads=ID=b52dc242d5ad893e:T=1557122797:S=ALNI_MaWnnuLLKP88qrPEsPEVuViaoGlJg; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; UNID=0df479d5-1639-4404-b6a8-36d731a7876d; _cb=Dz5K21B0CX5JDkjMG; _v__chartbeat3=DbUUrKDDGTaEDqauaQ; _cb_svref=null; AkamaiAnalytics_BrowserSessionId=4d41ba72-5ab2-8f33-46aa-f5748aca9647; HTML_VisitIntervalStartTime=1557125661015; s_sess=%20s_cc%3Dtrue%3B%20s_sq%3D%3B; adnum=3undefined; _chartbeat2=.1557122809880.1557125676423.1.DoKBmGC2OW5QDWek5PEERi8oZtYZ.12; HTML_BitRateBucketCsv=0,19083,16715,0,0,0,0,0; HTML_VisitValueCookie=1|1|1|0|35798|35826|0|0|0|0|0|0|NaN; s_pers=%20s_fid%3D22C0AF24132A0778-001FDBB2DA7591AD%7C1620284117936%3B%20s_c20%3D1557125717941%7C1651733717941%3B%20s_c20_s%3DFirst%2520Visit%7C1557127517941%3B; HTML_isPlayingCount=2; GED_PLAYLIST_ACTIVITY=W3sidSI6IlhYU0wiLCJ0c2wiOjE1NTcxMjU3MzgsIm52IjowLCJ1cHQiOjE1NTcxMjU1NDQsImx0IjoxNTU3MTI1NjM3fSx7InUiOiIzbnlXIiwidHNsIjoxNTU3MTI1NzM3LCJudiI6MCwidXB0IjoxNTU3MTI1NTU2LCJsdCI6MTU1NzEyNTYyM30seyJ1IjoiWG81TSIsInRzbCI6MTU1NzEyNTczNywibnYiOjEsInVwdCI6MTU1NzEyNTU4NSwibHQiOjE1NTcxMjU3MzV9XQ..; HTML_VisitCountCookie=1'
        }
        self.downloadPath = '/data/crawler'   # local root for downloaded images
        self.picPath = '/abc_news/picture/'   # relative path stored with the record
        self.filter = Filter_Data()           # duplicate-URL filter (project-defined)
        self.save = Save_Data()               # persistence layer (project-defined)

    def run(self):
        self.parsing_abc_news_list()

    def parsing_abc_news_list(self):
        """Query NewsAPI for each keyword and parse the returned article list."""
        today = self.point_time
        for kw in self.key_word:
            print('keyword:\t', kw)
            # NOTE(review): the `to` date decrements the day string's last
            # digit -- confirm the intended date window (breaks on days
            # ending in 0).
            news_list = self.news_api.get_everything(
                q=kw,
                sources='abc-news',
                domains='abcnews.go.com',
                from_param=today,
                to=today[:-1] + str(int(today[-1]) - 1),
                language='en',
                sort_by='relevancy',
                page_size=100,
            )
            self.parsing_news_list_url(news_list=news_list)

    def parsing_news_list_url(self, news_list):
        """Parse each unseen article from a NewsAPI result and save complete ones."""
        articles = news_list['articles']
        for i in range(len(articles)):
            details_url = articles[i]['url']
            result = self.filter.filter_data(details_url=details_url)
            if result:
                print('Data already exists!')
            else:
                time.sleep(random.uniform(1, 3))  # jittered delay
                details_res = requests.get(details_url,
                                           headers=self.headers,
                                           cookies=self.cookies).text
                html_obj = etree.HTML(details_res)
                source = int(2)  # fixed source id for ABC records
                sourceUrl = details_url
                jobId = time.time()
                authorName = articles[i]['source']['name']
                releaseTime = articles[i]['publishedAt']
                title_source = articles[i]['title']
                title = self.parsing_news_title(html_obj=html_obj,
                                                title_source=title_source)
                content = self.parsing_news_content(content_html=details_res,
                                                    html_obj=html_obj,
                                                    newspaper=True)
                thumbnail_img = articles[i]['urlToImage']
                img = self.download_img(html_obj=html_obj,
                                        thumbnail_img=thumbnail_img)
                if img is None or img == '' or content is None or content == '':
                    pass  # incomplete article: drop without saving
                else:
                    data = {
                        'source': source,
                        'jobId': int(jobId),
                        'sourceUrl': sourceUrl,
                        'title': title,
                        'authorName': authorName,
                        'releaseTime': releaseTime,
                        'content': content,
                        'img': img
                    }
                    print('data:\n', data)
                    self.save.save_data(data=data, news='abc')

    def parsing_news_title(self, html_obj, title_source):
        """Prefer the on-page headline; fall back to the NewsAPI title."""
        title = ''.join(
            html_obj.xpath('//header[@class="article-header"]//h1/text()'))
        if title == '' or title is None:
            return title_source
        else:
            return title

    def parsing_news_content(self,
                             content_html=None,
                             html_obj=None,
                             newspaper=False):
        """Extract the article body.

        ``newspaper=True`` runs newspaper's fulltext() on the raw HTML;
        otherwise joins div#news-content <p> text nodes with '<p>'.
        """
        if newspaper:
            text = fulltext(content_html).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath(
                '//div[@id="news-content"]//p/text()')
            content = '<p>'.join([
                i.replace("\n", '').strip() for i in content_list
            ]).replace("<p><p>", '<p>')
        return content

    def download_img(self, html_obj, thumbnail_img):
        """Download up to 17 page images, falling back to the NewsAPI thumbnail.

        Returns comma-joined relative paths (or the single thumbnail path),
        or None on failure.
        """
        try:
            pic_url_list = html_obj.xpath('//figure//div//picture//img/@src')
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            if pic_url_list == []:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img = r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                   str(index))
                return img
            else:
                for pic_url in pic_url_list[:17]:
                    urllib.request.urlretrieve(
                        pic_url,
                        r'%s.jpg' % (self.downloadPath + self.picPath +
                                     str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                     str(index)))
                    index += 1
                img = ','.join(img_list)
                return img
        except Exception:
            # Fix: was a bare `except: pass` (implicit None); made explicit.
            return None
class Smart_News():
    """Scraper for smithsonianmag.com category listings (pages 1-2)."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.downloadPath = '/data/crawler'    # local root for downloaded images
        self.picPath = '/smartNews/picture/'   # relative path stored with the record
        self.filter = Filter_Data()            # duplicate-URL filter (project-defined)
        self.save = Save_Data()                # persistence layer (project-defined)

    def run(self):
        """Crawl pages 1 and 2 of each configured category."""
        index = 1
        while index < 3:
            url_dic = {
                'news':
                'https://www.smithsonianmag.com/category/smart-news/?no-ist%252F=938&page={}'
                .format(index),
                'history':
                'https://www.smithsonianmag.com/category/history/?page={}'.
                format(index),
                'science':
                'https://www.smithsonianmag.com/category/science-nature/?page={}'
                .format(index),
                'innovation':
                'https://www.smithsonianmag.com/category/innovation/?page={}'.
                format(index),
                'arts_culture':
                'https://www.smithsonianmag.com/category/arts-culture/?page={}'
                .format(index),
                'travel':
                'https://www.smithsonianmag.com/category/travel/?page={}'.
                format(index),
                'smithsonian':
                'https://www.smithsonianmag.com/category/smithsonian-institution/?page={}'
                .format(index),
            }
            for kw in url_dic:
                print('keyword:\t', kw)
                self.parsing_smart_news_list_page(url_dic[kw])
            index += 1

    def parsing_smart_news_list_page(self, list_url):
        """Fetch one list page; parse and save each unseen headline's article.

        The i-th headline is paired with the i-th <img> under <main> as its
        thumbnail fallback.
        """
        time.sleep(random.uniform(5, 10))  # jittered delay between list requests
        list_res = requests.get(url=list_url, headers=self.headers).text
        html = etree.HTML(list_res)
        thumbnail_img_list = html.xpath('//main[@class="main"]//img//@src')
        details_url_list = html.xpath('//h3[@class="headline"]//a/@href')
        for i in range(len(details_url_list)):
            result = self.filter.filter_data(
                details_url='https://www.smithsonianmag.com' +
                details_url_list[i])
            if result:
                print('Data already exists!')
            else:
                data = self.parsing_details_page(
                    details_url='https://www.smithsonianmag.com' +
                    details_url_list[i],
                    thumbnail_img=thumbnail_img_list[i])
                if data is None or data == '':
                    pass  # incomplete article: drop without saving
                else:
                    print('data:\n', data)
                    self.save.save_data(data=data, news='smart')

    def parsing_details_page(self, details_url, thumbnail_img):
        """Parse one article; return its record dict, or None when incomplete."""
        time.sleep(random.uniform(3, 5))
        details_res = requests.get(url=details_url,
                                   headers=self.headers).text
        details_html = etree.HTML(details_res)
        source = int(5)  # fixed source id for smithsonianmag records
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(details_html.xpath('//h1[@class="headline"]/text()'))
        if title is None or title == '':
            pass  # no headline -> implicit None
        else:
            text = fulltext(details_res).split('\n')
            txt = list(filter(lambda x: x.strip() != '', text))
            content = '<p>'.join(txt)
            author = details_html.xpath('//a[@class="author-name"]/text()')
            authorName = ''.join(
                [i.replace("/n", '<p>').strip() for i in author])
            releaseTimeList = details_html.xpath(
                '//time[@class="pub-date"]/text()')
            releaseTime = ''.join(
                [i.replace("/n", '<p>').strip() for i in releaseTimeList])
            img = self.analysis_filter_img_url(html=details_html,
                                               thumbnail_img=thumbnail_img)
            if img is None or img == '' or content is None or content == '':
                pass  # incomplete article -> implicit None
            else:
                return {
                    'source': source,
                    'jobId': int(jobId),
                    'sourceUrl': sourceUrl,
                    'title': title,
                    'authorName': authorName,
                    'releaseTime': releaseTime,
                    'content': content,
                    'img': img
                }

    def analysis_filter_img_url(self, html, thumbnail_img):
        """Pick in-article 'filer' images (non-PNG); fall back to the list thumbnail."""
        href_list = html.xpath('//main[@class="main"]//img//@src')
        pic_url_list = [
            i for i in href_list if 'filer' in i and 'png' not in i
        ]
        img = self.download_img(pic_url_list=pic_url_list)
        if img == '' or img is None:
            img = self.download_img(pic_url_list=[thumbnail_img])
            return img
        else:
            return img

    def download_img(self, pic_url_list):
        """Download up to 17 images (skipping 220x130 / 60x60 crops) via PIL.

        Returns comma-joined relative paths, or None on any failure.
        """
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            for pic_url in pic_url_list[:17]:
                if '220x130' in pic_url or '60x60' in pic_url:
                    pass  # skip known small crop sizes
                else:
                    response = requests.get(pic_url)
                    image = Image.open(BytesIO(response.content))
                    image.save(r'%s.jpg' % (self.downloadPath + self.picPath +
                                            str(img_id) + "-" + str(index)))
                    img_list.append(
                        r'%s.jpg' % (self.picPath + str(img_id) + "-" +
                                     str(index)))
                    index += 1
            img = ','.join(img_list)
            return img
        except Exception:  # fix: was a bare `except:`
            return None
class FOX_News(object):
    """Crawler for foxnews.com driven by the NewsAPI article index."""

    def __init__(self):
        # NOTE(security): API key is hardcoded in source; move it to
        # configuration / environment before this ships anywhere public.
        self.news_api = NewsApiClient(api_key='f04f7a8db32841299d4a7fae723e61b2')
        self.t = time.time()
        self.point_time = time.strftime('%Y-%m-%d', time.localtime(self.t))
        # NOTE(review): 'word' looks like a typo for 'world' — confirm with
        # the NewsAPI query logs before changing it.
        self.keyword = ['us', 'word', 'opinion', 'politics', 'entertainment',
                        'lifestyle', 'health', 'travel', 'autos']
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.cookies = {
            'cookie': '_cb_ls=1; optimizelyEndUserId=oeu1556269407120r0.4256555044820445; cto_lwid=a3569f8e-fd62-48fd-8cf3-52e3a3d49218; _gcl_au=1.1.1392012605.1556269408; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22cfa5a6d1-cac6-4a48-97ed-e2a25488a94a%22; _ga=GA1.2.353904812.1556269412; _cb=D6-ViRhsUuoBSGama; __gads=ID=0a226a472ca026e8:T=1556269422:S=ALNI_Mb8qEqiRmqgHFem87cBOSEiCTTaJQ; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; _scid=47caae3a-e216-48d9-8cdc-3159238a7671; FXN_flk=1; AMCVS_17FC406C5357BA6E0A490D4D%40AdobeOrg=1; _gid=GA1.2.1114110874.1557801782; s_cc=true; _csrf=qWmVWRGxKfzqXCxI9_yuGfZI; s_sq=%5B%5BB%5D%5D; AKA_A2=A; ak_bmsc=3362DC65CD8C5F6FE2F5F2E24D7DD7FE6876060DED3200004C65DA5C0E141B34~pl9V8ncmx0JI/913nUJgfYoKX6Gte64URfMw4gBpTiaQPEzpKVnyOxRIc/NBeHS9HwdJZ+Fd5cB6oDFLpRNLt93qTu4fSjWuP7e+PZea5EArlAr63c0rHI5P+U7hKycyZfvpMt2MSsmqLqtUqZqavEQxBprGj74WIJ0a5ZnH2vSP1CYH+4ijzZPqw/REPx+WlZ+jHCptyFj7C9pjBHstMpWmr4RW6NTHMwyBsckJbiQr0p+5gPNq/FUjz06HN7q/b4; _cb_svref=null; AMCV_17FC406C5357BA6E0A490D4D%40AdobeOrg=2121618341%7CMCIDTS%7C18031%7CMCMID%7C37985443320715041480395091296536963184%7CMCAAMLH-1557842971%7C7%7CMCAAMB-1558421455%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1557823855s%7CNONE%7CMCAID%7CNONE; s_pers=%20s_ppn%3Dfnc%253Aroot%253Aroot%253Achannel%7C1557806491239%3B%20omtr_lv%3D1557816723185%7C1652424723185%3B%20omtr_lv_s%3DLess%2520than%25201%2520day%7C1557818523185%3B%20s_nr%3D1557816723191-Repeat%7C1560408723191%3B; _chartbeat2=.1556269420027.1557816723254.0000000010000001.CY0VlWCO9QUgDFFbO8QLxoyCPj7ho.2; s_sess=%20omtr_evar17%3DD%253Dc17%3B%20s_ppvl%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C22%252C83%252C5886%252C1920%252C925%252C1920%252C1080%252C1%252CL%3B%20SC_LINKS%3D%3B%20s_ppv%3Dfnc%25253Aworld%25253Asubsection%25253Aarticle%252C63%252C96%252C3550%252C1920%252C969%252C1920%252C1080%252C1%252CL%3B; criteo_write_test=ChUIBBINbXlHb29nbGVSdGJJZBgBIAE; bm_sv=8A6F070ED17B9F85AD022D562A830573~oN82OtrVhgL99OXQYjpsFWPKOuwBoUVwy60qge23Kx9pNN2MIe3/AhQZJZ+na42MjDAIyCRuvDS6csM6csNzVnCY/0Ue7dXJIHzFvEjq/KcL+5X57fiZK5b9W/W3g/hw1kSCvVxA/GNO4h9IlDmY6OElMgVSqN2h9kq42m6z+n0='
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/fox_news/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        self.parsing_fox_news_list()

    def parsing_fox_news_list(self):
        """Query NewsAPI for each keyword over the last day and parse results."""
        today = self.point_time
        # BUGFIX: the original built "yesterday" by decrementing the LAST
        # DIGIT of the date string (today[:-1] + str(int(today[-1]) - 1)),
        # which produces garbage like '2019-05-1-1' for days ending in 0.
        # Use real date arithmetic instead.
        yesterday = time.strftime('%Y-%m-%d', time.localtime(self.t - 86400))
        for kw in self.keyword:
            print('keyword:\t', kw)
            news_list = self.news_api.get_everything(q=kw,
                                                     sources='fox-news',
                                                     domains='foxnews.com',
                                                     from_param=today,
                                                     to=yesterday,
                                                     language='en',
                                                     sort_by='relevancy',
                                                     page_size=100)
            self.parsing_fox_news_list_url(news_list=news_list)

    def parsing_fox_news_list_url(self, news_list):
        """Walk the NewsAPI result set and persist every new article."""
        for article in news_list['articles']:
            details_url = article['url']
            if self.filter.filter_data(details_url=details_url):
                print('Data already exists!')
                continue
            details_res = requests.get(details_url, headers=self.headers,
                                       cookies=self.cookies).text
            time.sleep(random.uniform(1, 3))
            html_obj = etree.HTML(details_res)
            source = 9                  # fixed source id for fox-news
            sourceUrl = details_url
            jobId = time.time()
            authorName = self.parsing_author_name(
                html_obj=html_obj, name_source=article['source']['name'])
            releaseTime = article['publishedAt']
            title = article['title']
            content = self.parsing_news_content(content_html=details_res,
                                                html_obj=html_obj,
                                                newspaper=True)
            img = self.download_img(html_obj=html_obj,
                                    thumbnail_img=article['urlToImage'])
            if not img or not content:
                continue
            data = {'source': source, 'jobId': int(jobId),
                    'sourceUrl': sourceUrl, 'title': title,
                    'authorName': authorName, 'releaseTime': releaseTime,
                    'content': content, 'img': img}
            print('data:\n', data)
            self.save.save_data(data=data, news='fox')

    def parsing_author_name(self, html_obj, name_source):
        """Return the byline from the page, or fall back to the NewsAPI name."""
        authorName = ''.join(html_obj.xpath(
            '//div[@class="author-byline"]//span/span//text()'))
        return authorName if authorName else name_source

    def parsing_news_content(self, content_html=None, html_obj=None,
                             newspaper=False):
        """Extract the article body as '<p>'-joined text.

        With newspaper=True uses fulltext() and drops all-caps lines
        (section headers); otherwise joins the article-body <p> nodes.
        Returns None on any extraction failure.
        """
        try:
            if newspaper:
                text = fulltext(content_html).split('\n')
                txt = [i for i in text if i.strip()]
                txt_list = [i for i in txt if not i.isupper()]
                content = '<p>'.join(txt_list)
            else:
                content_list = html_obj.xpath(
                    '//div[@class="article-body"]//p//text()')
                content = '<p>'.join(
                    i.replace("\n", '').strip() for i in content_list
                ).replace("<p><p>", '<p>')
            return content
        except Exception:   # narrowed from bare except
            return None

    def download_img(self, html_obj, thumbnail_img):
        """Download article-body images (or the thumbnail as a fallback).

        Returns a comma-joined string of relative paths, or None on failure.
        """
        pic_url_list = html_obj.xpath('//div[@class="article-body"]//img/@src')
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            if not pic_url_list:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath
                                 + str(img_id) + "-" + str(index)))
                return r'%s.jpg' % (self.picPath + str(img_id)
                                    + "-" + str(index))
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath
                                 + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id)
                                             + "-" + str(index)))
                index += 1
            return ','.join(img_list)
        except Exception:   # narrowed from bare except
            return None
class Medium_News():
    """Crawler for medium.com topic pages and a few named publications."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        self.cookies = {
            'cookie': '__cfduid=d6ba6448200002747444269a19593dbdd1555908016; __cfruid=985eba5fa2a449247bfd0598c1c1c5ec968a9416-1558490711; _ga=GA1.2.1064338879.1558490714; _gid=GA1.2.136804405.1558490714; lightstep_guid/medium-web=8f0cd65b0ef4abdb; lightstep_session_id=fcac5cf910466bc4; pr=1; tz=-480; uid=3314454e53ae; sid=1:4N4F93p0H1gPvFCIGldZUdIdQeFiifNF6stzqPFyBikCsGpjcmnIyu/NNWwIVVTx; xsrf=89TsRPcZaZKu; lightstep_guid/lite-web=7d9b16045b97b840; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://medium.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1558512703778%2C%22slts%22:1558503751909}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:3%2C%22last_session_ts%22:1558512703778}; sz=1905'
        }
        self.downloadPath = '/data/crawler'
        # NOTE(review): path says 'huffpost' although this is the Medium
        # crawler — looks like a copy-paste leftover. Existing images already
        # live there, so changing it needs a coordinated migration.
        self.picPath = '/huffpost/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        """Crawl all configured topic pages, then the named publications."""
        news_dict = {
            'topic': 'https://medium.com/topic/editors-picks',
            'technology': 'https://medium.com/topic/technology',
            'startups': 'https://medium.com/topic/startups',
            'self': 'https://medium.com/topic/self',
            'politics': 'https://medium.com/topic/politics',
            'health': 'https://medium.com/topic/health',
            'design': 'https://medium.com/topic/design',
            'art': 'https://medium.com/topic/art',
            'beauty': 'https://medium.com/topic/beauty',
            'humor': 'https://medium.com/topic/humor',
            'fiction': 'https://medium.com/topic/fiction',
            'media': 'https://medium.com/topic/social-media',
            'crime': 'https://medium.com/topic/true-crime',
            # 'comics':'https://medium.com/topic/comics',
        }
        for i in news_dict:
            self.parsing_medium_topic_list_page(url=news_dict[i])
        news_list = {
            'elemental': 'https://medium.com/elemental-by-medium',
            'heated': 'https://heated.medium.com/',
            'human': 'https://medium.com/human-parts',
        }
        for j in news_list:
            self.parsing_medium_other_list_page(url=news_list[j])

    def parsing_medium_topic_list_page(self, url):
        """Pull article URLs out of the inline JSON state of a topic page."""
        html = requests.get(url=url, headers=self.headers,
                            cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        script = re.findall(r'<script>(.*?)</script>', html, re.S | re.M)
        # The URL list lives in the 5th inline script; guard against layout
        # changes instead of raising IndexError (the original did script[4]
        # unconditionally).
        if len(script) < 5:
            return
        rule = re.compile(r'"mediumUrl":"(.*?)"')
        for raw_url in rule.findall(script[4]):
            # Medium escapes '/' as \u002F inside the JSON blob.
            details_url = raw_url.replace(r'\u002F', '/')
            self.parsing_details_page(details_url=details_url)

    def parsing_medium_other_list_page(self, url):
        """Pull article URLs out of a publication's HTML listing."""
        res = requests.get(url=url, headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        list_page_urls = html.xpath(
            '//div[@class="u-lineHeightBase postItem"]/a/@href')
        for details_url in list_page_urls:
            self.parsing_details_page(details_url=details_url)

    def parsing_details_page(self, details_url):
        """Fetch, parse and persist one article (skipping known URLs)."""
        if self.filter.filter_data(details_url=details_url):
            print('Data already exists!')
            return
        res = requests.get(url=details_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(res)
        source = 8                  # fixed source id for medium
        jobId = time.time()
        sourceUrl = details_url
        title = ''.join(html.xpath('//div[@class="section-content"]//h1//text()'))
        if not title:
            return
        authorName = ''.join(html.xpath('//div[@class="u-paddingBottom3"]/a/text()'))
        releaseTime = ''.join(html.xpath('//time/text()'))
        content = self.analysis_news_content(html=res, obj=html, newspaper=False)
        img = self.analysis_news_img(obj=html)
        if not img or not content:
            return
        data = {'source': source, 'jobId': int(jobId), 'sourceUrl': sourceUrl,
                'title': title, 'authorName': authorName,
                'releaseTime': releaseTime, 'content': content, 'img': img}
        print('data:\n', data)
        self.save.save_data(data=data, news='medium')

    def analysis_news_content(self, html, obj, newspaper=True):
        """Extract '<p>'-joined article text (via newspaper or via XPath)."""
        if newspaper:
            text = fulltext(html).split('\n')
            txt = [line for line in text if line.strip()]
            return '<p>'.join(txt)
        # [7:] skips the leading header/byline text nodes of section-content.
        content_list = obj.xpath('//div[@class="section-content"]//text()')[7:]
        return '<p>'.join(
            i.replace("\n", '').strip() for i in content_list
        ).replace("<p><p>", '<p>')

    def analysis_news_img(self, obj):
        """Download up to 17 progressive-media images.

        Returns a comma-joined string of relative paths, or None when there
        are no images or any download fails.
        """
        try:
            pic_url_list = obj.xpath(
                '//img[@class="progressiveMedia-image js-progressiveMedia-image"]/@data-src')
            if not pic_url_list:
                return None
            img_id = str(uuid.uuid4()).replace('-', '')
            index = 1
            img_list = []
            for pic_url in pic_url_list[:17]:
                response = requests.get(pic_url)
                image = Image.open(BytesIO(response.content))
                image.save(r'%s.jpg' % (self.downloadPath + self.picPath
                                        + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id)
                                             + "-" + str(index)))
                index += 1
            return ','.join(img_list)
        except Exception:   # narrowed from bare except
            return None
class Techcrunch_News(object):
    """Crawler for techcrunch.com via its wp-json magazine endpoint."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.cookies = {
            'cookie': 'rxx=293dmvskaws.1i9frk99&v=1; _ga=GA1.2.893461412.1556504122; _fbp=fb.1.1556504132562.1028440015; __pnahc=0; __pat=-25200000; OTH=v=1&d=eyJraWQiOiIwMTY0MGY5MDNhMjRlMWMxZjA5N2ViZGEyZDA5YjE5NmM5ZGUzZWQ5IiwiYWxnIjoiUlMyNTYifQ.eyJjdSI6eyJndWlkIjoiUDJCVVRPR0RVT0VGVERUV0pNTFdKNFlFSDQiLCJzaWQiOiJEYTNsbjdKbG11MmwifX0.HoiBv5OlQvNY2x6q-LJBN-VzgCErT7GTCnODqLLQ8foasqTVUCPVXvwHFniFc7CwCf0n7lmSgfrSycQNevIFSJHZ7M-S9SRQH4FMtu91qykbuvAzAOQZRw_iz_warZWFJtpIys0EVH4Gn9wYqaqLXv-5lO39fuPsqJx9z7X6luQ; BX=e7bndb9eccnhl&b=4&d=lrdDlyNpYELw7nQr45ylAA--&s=8v&i=_g5gVQJJ34nc.9WZ3JGN; GUC=AQEAAQJc0SBdu0IgxgTZ&s=AQAAAJQ2Kk5V&g=XM_RDg; __tbc=%7Bjzx%7DjGAToaZMxJYLoS7N4KRjDaxHalABoj31MSFHkZP0UNxHLBrPMu6clUAaZwsaHnUnQQaMDnEIRO1fDpAMrkMVflCNhUFWsFFB8n1hsUBhKEKL38bZEAUprS1G6wPj4GNM4bchi9l7YPvr6or9wrNMLmWzw2hPXY5j7UVUWDOUH_U; __pcvc={}; _parsely_session={%22sid%22:6%2C%22surl%22:%22https://techcrunch.com/2019/05/04/uber-is-facing-australian-class-action-suit-alleging-unlawful-conduct/%22%2C%22sref%22:%22%22%2C%22sts%22:1558071232131%2C%22slts%22:1557885616789}; _parsely_visitor={%22id%22:%22pid=092447ecfa41ad2c2f2833a4997f1d2f%22%2C%22session_count%22:6%2C%22last_session_ts%22:1558071232131}; cmp=t=1558071232&j=0; _gid=GA1.2.1358424281.1558071235; _gat=1; __adblocker=false; xbc=%7Bjzx%7DYW6Rlvft6bPCfQyJ3DedvFReFNeSWzD34uqjUgyftdmRMMeJaQrGxlc0RnHslaNJuW923ovrMyh3fAAIY_x7R_Da15zP9YopEn3Om90NI0T5GRkVz40I1R8zV8ZQB68kBF2YuF_JsLshS1YKLFcyLSN12KbxNP4vrnBqkqtIO2yaJ5LoTRrcAPA64ePs4VtlokVTqGlotnhRSiMBSeplyP6M0a5Lj5rCIn1GIetfFxi-gIZuaMlkdAHSSmrqD1nfLBrQXcHSWrDRR0PGzzVvFjSVEXhIbldyChWDeAkkgN0hgI8KXA304yID8T-gx9UZiwWN897EFpRv3ZNtbg5IqW5GixrDYN1X7y_FdQGe5c4Tlz-figdB5Mbe5Qj2godX23QAk9Y6PbNudCC8Em1tgOzteL0CnIShQ--XvwA9qsvEZSWlAxrGfFmStXYiVaTRc1BM1DSemqPeEIoI_XtXT1h-FOYTaDZfqgflEl3Qb8MlWCowztRcnRznul-OxLIUMkPAraljlm83Bs9Z0ZZTeULOzew-rPTbrZfnXeQjr8OJtrUbNexMaJib654rgmNL7kXPxNmVdB1ZWX5IXEgmiW4XKjZACr0RxZbzXhXFfEN9gPbI7xVJJD8kfmfWoGW_0O6MebIrRbW8xxFPLY90Mw; __pvi=%7B%22id%22%3A%22v-2019-05-17-13-33-55-875-qtmTHL61BOYny2co-1510a80d282f15b71b1e5f4d8bc358ee%22%2C%22domain%22%3A%22.techcrunch.com%22%2C%22time%22%3A1558072171993%7D'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/techcrunch/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        """Crawl pages 1-2 of the magazine JSON feed."""
        for pg in range(1, 3):
            start_url = ('https://techcrunch.com/wp-json/tc/v1/magazine'
                         '?page={}&_embed=true'.format(pg))
            self.parsing_tc_news_list_url(list_url=start_url)

    def parsing_tc_news_list_url(self, list_url):
        """Parse one JSON listing page; dispatch new articles for parsing."""
        res = requests.get(url=list_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        for item in json.loads(res):
            sourceUrl = item["link"]
            if self.filter.filter_data(details_url=sourceUrl):
                continue
            self.parsing_details_page(header_info={
                'sourceUrl': sourceUrl,
                'releaseTime': item["date_gmt"],
            })

    def parsing_details_page(self, header_info):
        """Fetch and persist one article described by header_info
        ({'sourceUrl': ..., 'releaseTime': ...})."""
        jobId = time.time()
        source = 14                 # fixed source id for techcrunch
        sourceUrl = header_info['sourceUrl']
        releaseTime = header_info['releaseTime']
        res = requests.get(url=sourceUrl, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(res)
        title = ''.join(html.xpath('//h1[@class="article__title"]/text()'))
        authorName = ''.join(
            html.xpath('//div[@class="article__byline"]/a/text()')).strip()
        content = self.parsing_news_content(content_html=res)
        img = self.download_img(html_obj=html)
        if not img or not content:
            return
        data = {
            'source': source,
            'jobId': int(jobId),
            'sourceUrl': sourceUrl,
            'title': title,
            'authorName': authorName,
            'releaseTime': releaseTime,
            'content': content,
            'img': img
        }
        print('data:\n', data)
        self.save.save_data(data=data, news='techcrunch')

    def parsing_news_content(self, content_html=None):
        """Return the article body extracted by newspaper, '<p>'-joined."""
        text = fulltext(content_html).split('\n')
        txt = [line for line in text if line.strip()]
        return '<p>'.join(txt)

    def download_img(self, html_obj):
        """Download up to 17 article images.

        Returns a comma-joined string of relative paths, or None when there
        are no images or a download fails. (The original had no error
        handling here — a single failed urlretrieve aborted the whole run;
        every sibling crawler returns None instead.)
        """
        pic_url_list = html_obj.xpath(
            '//article[@class="article-container article--post "]//img/@src')
        if not pic_url_list:
            return None
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath
                                 + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id)
                                             + "-" + str(index)))
                index += 1
            return ','.join(img_list)
        except Exception:
            return None
class Matador_Network(object):
    """Crawler for matadornetwork.com via its mn-ajax post endpoint."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        self.cookies = {
            'cookie': '_ga=GA1.2.1006188425.1558506407; __auc=c07ed86116ade38958b6f215c90; __gads=ID=ebab27bbe751d3d9:T=1558506409:S=ALNI_MbKANVLVZlZmub7wcHXVdRq__9uAQ; _fbp=fb.1.1558506413908.2064431331; cache-primed=1; mn-push-status=8; EU=(null); _gid=GA1.2.549309516.1558921947; __asc=2b5b28f116af808ea5c6cf504f0'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/matador_network/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        """Page through the ajax feed, 20 posts at a time (offsets 8..67)."""
        for pg in range(8, 68, 20):
            start_url = 'https://matadornetwork.com/wp-content/plugins/matadornetwork/mn-ajax.php?component=post&action=get_posts&' \
                        'offset={}' \
                        '&posts_per_page=20&grid=small&post__not_in%5B%5D=546093&post__not_in%5B%5D=%20546069&post__not_in%5B%5D=%20545872&post__not_in%5B%5D=%20497520&post__not_in%5B%5D=%20501737&post__not_in%5B%5D=%20486578&post__not_in%5B%5D=%20342847&home=1&_=1558941893778'.format(pg)
            self.parsing_matador_network_list_page(list_url=start_url)

    def parsing_matador_network_list_page(self, list_url):
        """Parse the HTML embedded in one ajax response; save new articles."""
        res = requests.get(url=list_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))
        js = json.loads(res)
        html_obj = etree.HTML(js['html'])
        urls_list = html_obj.xpath('//a[@class="article__image-wrapper"]/@href')
        for details_url in urls_list:
            if self.filter.filter_data(details_url=details_url):
                print('Data already exists!')
                continue
            data = self.parsing_details_page_url(details_url=details_url)
            # BUGFIX: parsing_details_page_url returns None for unusable
            # pages; the original passed that None straight to save_data.
            if data:
                print('data:\t', data)
                self.save.save_data(data=data, news='matador')

    def parsing_details_page_url(self, details_url):
        """Fetch one article page; return its record dict or None."""
        res = requests.get(url=details_url, headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        time.sleep(random.uniform(1, 3))
        source = 11                 # fixed source id for matador
        sourceUrl = details_url
        jobId = time.time()
        title = ''.join(html.xpath('//div[@class="container"]//h1/text()'))
        releaseTime = ''.join(html.xpath('//div[@class="post-info-date"]/text()'))
        authorName = ''.join(html.xpath('//a[@class="post-info-author"]/text()'))
        content = self.analysis_news_content(content_html=res, html_obj=html,
                                             newspaper=True)
        img = self.analysis_content_img(html_obj=html)
        if not img or not content:
            return None
        return {
            'source': source,
            'jobId': int(jobId),
            'sourceUrl': sourceUrl,
            'title': title,
            'authorName': authorName,
            'releaseTime': releaseTime,
            'content': content,
            'img': img
        }

    def analysis_news_content(self, content_html=None, html_obj=None,
                              newspaper=True):
        """Extract '<p>'-joined article text (via newspaper or via XPath)."""
        if newspaper:
            text = fulltext(content_html).split('\n')
            txt = [line for line in text if line.strip()]
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath('//div[@class="post-content"]//text()')
            content = '<p>'.join(
                i.replace("\n", '').strip() for i in content_list
            ).replace("<p><p>", '<p>')
        return content

    def analysis_content_img(self, html_obj):
        """Download article images (non-.png), capped at 17 like the sibling
        crawlers. Returns comma-joined relative paths, or None on failure."""
        pic_url_list = html_obj.xpath('//div[@class="container"]//img/@src')
        pic_set = [i for i in pic_url_list if '.png' not in i]
        if not pic_set:
            return None
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            # [:17] and the except clause bring this in line with every other
            # crawler in the file; the original was uncapped and unprotected.
            for pic_url in pic_set[:17]:
                response = requests.get(pic_url)
                image = Image.open(BytesIO(response.content))
                image.save(r'%s.jpg' % (self.downloadPath + self.picPath
                                        + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id)
                                             + "-" + str(index)))
                index += 1
            return ','.join(img_list)
        except Exception:
            return None
class Buzz_Feed_News():
    """Crawler for buzzfeednews.com via its trending JSON endpoint."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        self.cookies = {
            'cookie': '_ga=GA1.2.2006098489.1555559856; _fbp=fb.1.1555559860721.1190642659; __qca=P0-464700868-1555559857580; permutive-id=35526ebd-337f-4b00-bf5b-10a6610a85a5; __gads=ID=2cb4be529258fba6:T=1555559912:S=ALNI_MawMBKcEjsbSC3roAOVcCQm5lCB2A; _pdfps=%5B7684%2C13160%2C13164%2C13319%2C13730%2C14474%2C10166%2C12448%2C12449%2C12882%2C13097%2C13214%2C13217%2C13276%2C13278%2C13834%2C14353%2C10748%2C10788%2C13102%2C13144%2C13145%2C13146%2C13147%2C13150%2C13151%2C13157%2C13163%2C13169%2C13667%2C14437%2C14458%2C10224%2C10915%2C13153%2C13675%2C14142%2C13064%2C13216%2C13279%2C14431%2C14432%2C10749%2C10789%2C10906%2C10916%2C10917%2C11655%2C12233%2C12244%2C12679%2C12985%2C13099%2C13101%2C13148%2C13244%2C13741%2C13742%2C14143%2C14479%2C14872%2C15077%2C15128%2C15139%2C10222%2C13100%2C10216%2C%2212244-15-22969%22%2C%2212244-15-22970%22%2C%2212679-5-118997US%22%2C%2212985-5-118497US%22%2C%2213244-5-325997US%22%2C%2213245-5-325997US%22%2C%2213246-5-325997US%22%2C%2213458-15-22969%22%2C%2213458-15-22970%22%2C%2213459-15-22969%22%2C%2213459-15-22970%22%2C%2214229-5-318346US%22%2C%2214351-15-22835%22%2C%2214479-5-325547US%22%2C%2214872-15-22835%22%2C%2214872-15-22814%22%2C%2215063-5-318346US%22%2C%2215063-5-325346US%22%5D; permutive-session=%7B%22session_id%22%3A%2215c3b65a-580a-47aa-ab95-b44076421376%22%2C%22last_updated%22%3A%222019-04-27T02%3A27%3A38.962Z%22%7D; _cmpQcif3pcsupported=1; _gid=GA1.2.13310005.1557196067; _gat=1; sailthru_pageviews=4; sailthru_content=cbe347ea3dd8f028b2a79dd2124b2609d73dc57549ee138bd1d9dedee18e797c3cde4668fc0929097a33767e5b408948300e3683df34cf01dce50805bbb1306ce0bce460f7e70fed288b52d84bd9816499693f0167a253c9d1ba851de3a9d8e9dd7ae6730eff39df6f3b2fee47cae2908e3260668e0361ea9bd2ebb68e2a0591e9ec864cd274cc1d8b3a98016c2bcf1d874e57a78b55d2f981aeb6d2c79bfecc9d43236330abbff1afd96b7ffa626bb4936065bb0196c7181b628021dea483cf13a2f044347925f429d5fbc7008162c9cd736b79ca68d62341101204bca0cca1ff22ee54be7fa316d48db768db05dda4f044956926b209e90497a64953e290f7; sailthru_visitor=a057d87e-b51c-4f1e-9167-d146c2a3a7bc'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/buzzfeed/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        """Crawl pages 1-2 of the trending feed, pausing between pages."""
        for page in range(1, 3):
            news_list_url = ('https://www.buzzfeednews.com/site-component/v1/'
                             'en-us/trending-on-buzzfeednews'
                             '?page={}&page_size=10'.format(page))
            self.parsing_buzzFeed_news_list(news_list_url=news_list_url)
            time.sleep(random.uniform(60, 70))

    def parsing_buzzFeed_news_list(self, news_list_url):
        """Walk one trending-feed JSON page and persist new articles."""
        res = requests.get(url=news_list_url, headers=self.headers,
                           cookies=self.cookies).text
        for result in json.loads(res)['results']:
            jobId = time.time()
            source = 3              # fixed source id for buzzfeed
            sourceUrl = result['url']
            if self.filter.filter_data(details_url=sourceUrl):
                print('Data already exists!')
                continue
            article = self.parsing_details_url(details_url=sourceUrl,
                                               thumbnail_img=result['image'])
            if not article:
                continue
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': result['name'],
                'authorName': article['authorName'],
                'releaseTime': result['created_at'],
                'content': article['content'],
                'img': article['img']
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='buzzfeed')

    def parsing_details_url(self, details_url, thumbnail_img):
        """Fetch one article page.

        Returns {'authorName', 'content', 'img'} or None when the body or
        the images could not be extracted.
        """
        time.sleep(random.uniform(2, 5))
        html = requests.get(url=details_url, headers=self.headers,
                            cookies=self.cookies).text
        html_obj = etree.HTML(html)
        authorName = ''.join(html_obj.xpath(
            '//span[@class="news-byline-full__name xs-block link-initial--text-black"]/text()'
        ))
        content_list = html_obj.xpath(
            '//div[@data-module="article-wrapper"]//p//text()')
        content = '<p>'.join(
            i.replace("\n", '').strip() for i in content_list
        ).replace("<p><p>", '<p>')
        if not content:
            return None
        img = self.download_img(html=html_obj, thumbnail_img=thumbnail_img)
        if not img:
            return None
        return {'authorName': authorName, 'content': content, 'img': img}

    def download_img(self, html, thumbnail_img):
        """Download up to 17 article images, falling back to the thumbnail.

        Returns comma-joined relative paths, or None on failure (the original
        had no error handling — one failed urlretrieve crashed the run).
        """
        pic_list_1 = html.xpath('//figure//img/@data-src')
        pic_list_2 = html.xpath('//picture//img/@src')
        # de-duplicate while keeping pic_list_2 entries first
        pic_list_3 = [i for i in pic_list_1 if i not in pic_list_2]
        pic_url_list = pic_list_2 + pic_list_3
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            if not pic_url_list:
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath
                                 + str(img_id) + "-" + str(index)))
                return r'%s.jpg' % (self.picPath + str(img_id)
                                    + "-" + str(index))
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath
                                 + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id)
                                             + "-" + str(index)))
                index += 1
            return ','.join(img_list)
        except Exception:
            return None
class New_York_Post_news(object):
    """Crawler for nypost.com / pagesix.com section listing pages."""

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.cookies = {
            'cookie': 'optimizelyEndUserId=oeu1555918735298r0.6812295616493853; _ga=GA1.2.501461025.1557229334; __pnahc=0; __tbc=%7Bjzx%7DbOREsfUR6SMcRp5niCu4XyJqGIm9xLbU2svbGCB3e5Y-ZlpcwIXRF_gOx5ssrGMlRCzVeeO-JA50xgthIobqDMJS2og0GbDQCa7bPklPxk1yFokaLXVHvRa0s4J7s817Uqt8s09tJ4GcmUzNoGeVhA; __pat=-14400000; __gads=ID=b83698edc796f48a:T=1557229341:S=ALNI_MZYuxiKvMlIMXV92xfTw1XB6ms7EA; __qca=P0-643878854-1557229359497; _gid=GA1.2.1904385410.1557829332; _ncg_g_id_=9d6dce3e-2d15-45b6-a948-9dbb7fa69171; OX_plg=pm; _ncg_id_=16a959d5d8b-5c91174d-2719-4974-99d8-33e86e4219c2; _pc_morningReportRan=true; _sp_ses.3725=*; _parsely_session={%22sid%22:3%2C%22surl%22:%22https://nypost.com/%22%2C%22sref%22:%22%22%2C%22sts%22:1557886391506%2C%22slts%22:1557829347024}; _parsely_visitor={%22id%22:%2236e3895b-5884-4e1e-b290-0b0a1e631850%22%2C%22session_count%22:3%2C%22last_session_ts%22:1557886391506}; AMP_TOKEN=%24NOT_FOUND; bounceClientVisit2045v=N4IgNgDiBcIBYBcEQM4FIDMBBNAmAYnvgHYCeEA9iggHQDGFAtkcQKYDu6BIANCAE4wQZStXpMQAXyA; _ncg_sp_ses.64db=*; _gat=1; __idcontext=eyJjb29raWVJRCI6IlpORUtKQzRVUEwzNUhUUTI2QkNJVUIySDZZVUFOTjI1V0E0VzI1WERQSU9RPT09PSIsImRldmljZUlEIjoiWk5FS0pDNFVPVFU1NzVaQTVZSU1LVlNHUlVFU0pOM0JWNEdTWVJHNUM0WEE9PT09IiwiaXYiOiJQR1ZMV1NQWTc2R0pGMkhISUJCVEpBTEc0UT09PT09PSIsInYiOjF9; __pvi=%7B%22id%22%3A%22v-2019-05-15-11-28-33-472-TCTTCzJ3MeatqGej-a5838d1dc51fc02369a5c570d5bb61d6%22%2C%22domain%22%3A%22.nypost.com%22%2C%22time%22%3A1557891488612%7D; __adblocker=false; xbc=%7Bjzx%7DCTyXA66nwH4u0LSnMj_hrMtwYTk54JF59dLs5o_wp3snMXNdvj2Yy6TBtbRxyGxf14_VW1q5TLlW6vo43sH4bt1xlU681XmGmmXaT-SetcMReVqnxTFjI2gW-7RAeJAQFo8mvk88JA2ghePCorbhbWMs02tfzF_-k1Krwk0Vz5I_4BWDD33FM1fohQjjcgYaPM-1rt-sKsCEnjEZlCFDpqiFO54mgbKUB-kFVcHhi-_WjEFJazS2Vtn_ZZJHi-y44g16CXbGiqpHfoDR9DPafHAts-4n-G65fMRtwt9Ml8JaS73yz78cdU_g515IoAaF5TiHkpwV8OOumbfwBrkq2AU3h3dtbnjKZd070tIlyyZdFCbfpjqxaxax2jiN0PitRuCioMt8p4TO3fxq6ok4tA; _ncg_sp_id.64db=d08b0f08-4e58-40a9-8cd3-63efa5ae79b6.1557229345.5.1557891492.1557886389.183b444b-a79a-4498-bd82-c06c607b176e; _sp_id.3725=b96eefdc3adfd036.1557229358.3.1557891502.1557831911'
        }
        self.downloadPath = '/data/crawler'
        self.picPath = '/nypost/picture/'
        self.filter = Filter_Data()
        self.save = Save_Data()

    def run(self):
        """Crawl pages 1-2 of every configured section."""
        for pg in range(1, 3):
            url_dic = {
                "news": 'https://nypost.com/news/page/{}/'.format(pg),
                "metro": 'https://nypost.com/metro/page/{}/'.format(pg),
                "pagesix": 'https://pagesix.com/page/{}/'.format(pg),
                "basketball": 'https://nypost.com/basketball/page/{}/'.format(pg),
                "baseball": 'https://nypost.com/baseball/page/{}/'.format(pg),
                "football": 'https://nypost.com/football/page/{}/'.format(pg),
                "college": 'https://nypost.com/college/page/{}/'.format(pg),
                "hockey": 'https://nypost.com/hockey/page/{}/'.format(pg),
                "business": 'https://nypost.com/business/page/{}/'.format(pg),
                "opinion": 'https://nypost.com/opinion/page/{}/'.format(pg),
                "entertainment": 'https://nypost.com/entertainment/page/{}/'.format(pg),
                "fashion": 'https://nypost.com/fashion/page/{}/'.format(pg),
                "living": 'https://nypost.com/living/page/{}/'.format(pg),
                "tech": 'https://nypost.com/tech/page/{}/'.format(pg),
            }
            for kw, url in url_dic.items():
                print('keyword:\t', kw)
                self.parsing_list_page_url(list_url=url)

    def parsing_list_page_url(self, list_url):
        """Extract URL/title/date triples from one listing page."""
        res = requests.get(url=list_url, headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        time.sleep(random.uniform(2, 5))
        details_urls_list = html.xpath('//h3[@class="entry-heading"]//a/@href')
        title_list = html.xpath('//h3[@class="entry-heading"]//a/text()')
        releaseTime_list = html.xpath('//div[@class="entry-meta"]//p//text()')
        # zip() keeps the three node lists aligned; the original indexed all
        # three by position and raised IndexError whenever the entry-meta
        # text nodes did not match the headings 1:1.
        for sourceUrl, title, releaseTime in zip(details_urls_list,
                                                 title_list,
                                                 releaseTime_list):
            self.parsing_details_page_url(source_headers={
                'sourceUrl': sourceUrl,
                'title': title,
                'releaseTime': releaseTime
            })

    def parsing_details_page_url(self, source_headers):
        """Fetch and persist one article described by source_headers
        ({'sourceUrl', 'title', 'releaseTime'})."""
        if self.filter.filter_data(details_url=source_headers['sourceUrl']):
            print('Data already exists!')
            return
        res = requests.get(url=source_headers['sourceUrl'],
                           headers=self.headers,
                           cookies=self.cookies).text
        html = etree.HTML(res)
        time.sleep(random.uniform(1, 3))
        source = 10                 # fixed source id for nypost
        sourceUrl = source_headers['sourceUrl']
        jobId = time.time()
        title = source_headers['title']
        releaseTime = source_headers['releaseTime']
        authorName = ''.join(html.xpath('//p[@class="byline"]//a/text()'))
        content = self.parsing_news_content(content_html=res, html_obj=html,
                                            newspaper=True)
        img = self.download_img(html_obj=html)
        if not img or not content:
            return
        data = {
            'source': source,
            'jobId': int(jobId),
            'sourceUrl': sourceUrl,
            'title': title,
            'authorName': authorName,
            'releaseTime': releaseTime,
            'content': content,
            'img': img
        }
        print('data:\n', data)
        # NOTE(review): 'nowYorkPost' looks like a typo for 'newYorkPost',
        # but Save_Data may key on this exact string — confirm before fixing.
        self.save.save_data(data=data, news='nowYorkPost')

    def parsing_news_content(self, content_html=None, html_obj=None,
                             newspaper=False):
        """Extract '<p>'-joined article text (via newspaper or via XPath)."""
        if newspaper:
            text = fulltext(content_html).split('\n')
            txt = [line for line in text if line.strip()]
            content = '<p>'.join(txt)
        else:
            content_list = html_obj.xpath('//div[@id="news-content"]//p/text()')
            content = '<p>'.join(
                i.replace("\n", '').strip() for i in content_list
            ).replace("<p><p>", '<p>')
        return content

    def download_img(self, html_obj):
        """Download up to 17 article images.

        Returns comma-joined relative paths, or None when there are no images
        or a download fails (the original had no error handling here).
        """
        pic_list_1 = html_obj.xpath('//div[@class="featured-image"]/img/@src')
        pic_list_2 = html_obj.xpath(
            '//div[@class="article-header"]//img/@data-srcset')
        pic_url_list = pic_list_1 + pic_list_2
        if not pic_url_list:
            return None
        img_id = str(uuid.uuid4()).replace('-', '')
        index = 1
        img_list = []
        try:
            for pic_url in pic_url_list[:17]:
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath
                                 + str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id)
                                             + "-" + str(index)))
                index += 1
            return ','.join(img_list)
        except Exception:
            return None
class UPROXX_News():
    """Crawler for uproxx.com.

    Walks the site's wovenis home-feed JSON endpoint page by page, follows
    every article link found in the embedded HTML, extracts text and images,
    and hands each record to Save_Data. Articles already seen (per
    Filter_Data) are skipped.
    """

    def __init__(self):
        # Browser-like headers/cookies so the site serves normal pages.
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        self.cookies = {
            'cookie': '_ga=GA1.2.1916480018.1557387143; _omappvp=NXTUG1O09XwizTEVPHY0CDatCFaa7zmyENXMZ3yBzNBfpRrUJyJSjzawWbNOtmCk3a0M6l51v1hv01nhoAdHqIQLyxntcGlZ; __gads=ID=967d7ff68a5a2656:T=1557387149:S=ALNI_MZFY5Q_tfI8WS1_30SK817ySI14RQ; _cb_ls=1; _cb=BvCRDMN1EZ-CKUJwA; _scid=12f16568-c9b2-4331-9513-626f26e7aac6; _fbp=fb.1.1558321180103.1335358405; __qca=P0-305115545-1558321179420; _chartbeat2=.1558321174581.1558322228287.1.w0ijvCb8zgbDJfkouB1YhL9BM0Wu2.15; _sctr=1|1559059200000; _gid=GA1.2.676333276.1559707655; _cmpQcif3pcsupported=1; _parsely_visitor={%22id%22:%22f94909f1-8e1d-499d-8590-04e058a8acdf%22%2C%22session_count%22:4%2C%22last_session_ts%22:1559707963140}; _parsely_slot_click={%22url%22:%22https://uproxx.com/dimemag/demarcus-cousins-warriors-game-2-nba-finals-passing-analysis-videos/%22%2C%22x%22:1163%2C%22y%22:0%2C%22xpath%22:%22//*[@id=%5C%22menu-item-1560569%5C%22]/a[1]%22%2C%22href%22:%22https://uproxx.com/news%22}; _threds=1; _thredb=uproxx.76a113a16f1e45e5bf36b23bf05e76a6.1558321178020.1559712981550.1559713181162.30.6; _gat_auPassiveTagger=1; _gat=1'
        }
        self.downloadPath = '/data/crawler'  # local root for image files
        self.picPath = '/uproxx/picture/'    # relative path recorded per image
        self.filter = Filter_Data()          # dedup filter (project helper)
        self.save = Save_Data()              # persistence layer (project helper)

    def run(self):
        """Crawl feed pages 1..9.

        Bug fix: the page number was hard-coded as ``format(1)``, so every
        iteration re-fetched page 1; the loop variable is now interpolated.
        """
        for pg in range(1, 10):
            start_url = ('https://uproxx.com/wp-json/wovenis/v1/home/{}'
                         '?offset=34'.format(pg))
            self.parsing_news_list_page(url=start_url)

    def parsing_news_list_page(self, url):
        """Fetch one JSON feed page and crawl every new article linked in it."""
        res = requests.get(url=url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))  # polite randomized delay
        js = json.loads(res)
        # The feed wraps rendered HTML inside a JSON envelope.
        html_obj = etree.HTML(js['html'])
        url_list = html_obj.xpath('//h2/a/@href')
        for i in url_list:
            status = self.filter.filter_data(details_url=i)
            if status:
                print('Data already exists!')
            else:
                try:
                    self.parsing_details_page(details_url=i)
                except Exception as e:
                    # Best-effort: one broken article must not abort the
                    # crawl, but the failure is no longer silently swallowed
                    # (the original used a bare ``except: pass``).
                    print('Failed to parse %s: %s' % (i, e))

    def parsing_details_page(self, details_url):
        """Download one article page, extract its fields and save them."""
        res = requests.get(url=details_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(1, 2))
        html = etree.HTML(res)
        source = int(13)     # numeric id assigned to this news source
        jobId = time.time()  # crawl timestamp doubles as the job id
        sourceUrl = details_url
        title = ''.join(html.xpath('//div[@class="post-top"]//h1//text()'))
        # NOTE(review): [0] raises IndexError when the byline/date nodes are
        # missing; the caller's try/except treats that as "skip article".
        authorName = html.xpath('//span[@class="authorname"]//text()')[0]
        releaseTime = html.xpath(
            '//span[@class="published-date uproxx-the-date"]//text()')[0]
        content = self.analysis_news_content(html=res)
        img = self.analysis_news_img(html_obj=html)
        # Persist only articles that have both body text and images.
        if img is None or img == '' or content is None or content == '':
            pass
        else:
            data = {
                'source': source,
                'jobId': int(jobId),
                'sourceUrl': sourceUrl,
                'title': title,
                'authorName': authorName,
                'releaseTime': releaseTime,
                'content': content,
                'img': img
            }
            print('data:\n', data)
            self.save.save_data(data=data, news='UPROXX')

    def analysis_news_content(self, html):
        """Return the article body as '<p>'-joined non-blank lines."""
        text = fulltext(html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        return '<p>'.join(txt)

    def analysis_news_img(self, html_obj):
        """Download up to 17 article images; return their relative paths
        comma-joined, or None when no image is found or a download fails."""
        pic_url_list = html_obj.xpath('//div[@class="ug_page"]//img/@src')
        if not pic_url_list:
            return None
        img_id = str(uuid.uuid4()).replace('-', '')
        img_list = []
        try:
            for index, pic_url in enumerate(pic_url_list[:17], start=1):
                response = requests.get(pic_url)
                image = Image.open(BytesIO(response.content))
                # Written under downloadPath; only the picPath-relative
                # name is recorded.
                image.save(r'%s.jpg' % (self.downloadPath + self.picPath +
                                        str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id) +
                                             "-" + str(index)))
            return ','.join(img_list)
        except Exception:
            # Image fetch/decoding failed; treat as "no usable image" so the
            # caller skips the article (narrowed from a bare ``except:``).
            return None
class CBS_News():
    """Crawler for cbsnews.com 'latest' listings (health, world, crime,
    entertainment, science, technology).

    Each section page yields article links plus listing thumbnails; every
    new article is fetched, parsed and handed to Save_Data.
    """

    def __init__(self):
        # Browser-like headers/cookies so the site serves normal pages.
        self.headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        self.cookies = {
            'cookies': 'fly_device=desktop; fly_geo={"countryCode": "cn"}; CBS_INTERNAL=0; _cb_ls=1; _cb=DrObeWDJQRFdCPmQx1; optimizelyEndUserId=oeu1556274100628r0.4116041118910556; __gads=ID=d68306632b854d8c:T=1556274103:S=ALNI_MYpAOeaoN_TEKi9ErEphorJuu4FxA; aam_uuid=38178500434044041890375836043549172921; _v__chartbeat3=DSbaGWCHXxS0C6XCeZ; first_page_today=false; cbsnews_ad=%7B%22type%22%3A%22gpt%22%2C%22region%22%3A%22aw%22%2C%22session%22%3A%22a%22%2C%22subSession%22%3A%223%22%7D; AMCVS_10D31225525FF5790A490D4D%40AdobeOrg=1; s_cc=true; OX_plg=pm; fly_vid=1a29bea6-1a13-4100-a305-ffa9b02166d3; pmtimesig=[[1556347239934,0],[1556350240525,3000591],[1556372772902,22532377]]; s_vnum=1558866104445%26vn%3D10; s_invisit=true; s_lv_undefined_s=Less%20than%201%20day; AMCV_10D31225525FF5790A490D4D%40AdobeOrg=1406116232%7CMCMID%7C37954619966530193010387509759393309121%7CMCAAMLH-1557023341%7C11%7CMCAAMB-1557023341%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1556425741s%7CNONE%7CvVersion%7C2.5.0; trc_cookie_storage=taboola%2520global%253Auser-id%3Dbfc0c49d-bde0-4b78-9484-33cd8cb7509f-tuct3bdf4f1; AAMC_cbsi_0=REGION%7C11%7CAMSYNCSOP%7C%7CAMSYNCS%7C; _cb_svref=null; _t_tests=eyJMdFRUYmdVZHBDcHBKIjp7ImNob3NlblZhcmlhbnQiOiJCIiwic3BlY2lmaWNMb2NhdGlvbiI6WyJEZlhyTVYiXX0sImxpZnRfZXhwIjoibSJ9; cbsn_device=desktop; muxData=mux_viewer_id=a3de65c6-88bd-4042-a748-fb385d2ada3d&msn=0.5261598146217972&sid=11df9f3c-9e4d-47e4-9786-2de0583451e8&sst=1556418792060&sex=1556421954813; GED_PLAYLIST_ACTIVITY=W3sidSI6ImdDTUIiLCJ0c2wiOjE1NTY0MjA0NTUsIm52IjoxLCJ1cHQiOjE1NTY0MjAxNDIsImx0IjoxNTU2NDIwNDU1fV0.; s_sq=%5B%5BB%5D%5D; prevPageType=topic_list; prevPageName=cbsnews:/latest/us/5/; s_getNewRepeat=1556420875652-Repeat; s_lv_undefined=1556420875654; utag_main=v_id:016a592a36a1009f5e955a97097003079001807100bd0$_sn:10$_ss:0$_st:1556422675588$vapi_domain:cbsnews.com$dc_visit:10$_pn:38%3Bexp-session$ses_id:1556418538777%3Bexp-session$dc_event:30%3Bexp-session$dc_region:eu-central-1%3Bexp-session; _chartbeat2=.1556274100027.1556420876067.111.atSntCpXEouDM4RkLBcjI23BVm-lP.40; s_ptc=%2Flatest%2Fus%2F5%2F%5E%5E0.00%5E%5E0.01%5E%5E0.28%5E%5E0.52%5E%5E0.63%5E%5E0.44%5E%5E5.08%5E%5E0.01%5E%5E6.59; RT="sl=38&ss=1556418537489&tt=40674&obo=1&sh=1556420880100%3D38%3A1%3A40674%2C1556420718464%3D37%3A1%3A34088%2C1556420455825%3D36%3A1%3A31715%2C1556420142482%3D35%3A1%3A30988%2C1556420128526%3D34%3A1%3A30943&dm=cbsnews.com&si=91b57407-760b-481b-87e3-bcff31d166db&bcn=%2F%2F173e2514.akstat.io%2F&ld=1556420880100&r=https%3A%2F%2Fwww.cbsnews.com%2Flatest%2Fus%2F5%2F&ul=1556420983930"'
        }
        self.downloadPath = '/data/crawler'   # local root for image files
        self.picPath = '/cbs_news/picture/'   # relative path recorded per image
        self.filter = Filter_Data()           # dedup filter (project helper)
        self.save = Save_Data()               # persistence layer (project helper)

    def run(self):
        """Crawl pages 1..3 of every section.

        Bug fix: the original body invoked the section parsers on a
        module-level ``cbs`` instance instead of ``self``, which breaks
        whenever the object is constructed under any other name.
        """
        for pg in range(1, 4):
            self.parsing_health_news_list_page(
                start_url='https://www.cbsnews.com/latest/health/{}/'.format(pg))
            self.parsing_word_news_list_page(
                start_url='https://www.cbsnews.com/latest/world/{}/'.format(pg))
            self.parsing_crime_news_list_page(
                start_url='https://www.cbsnews.com/latest/crime/{}/'.format(pg))
            self.parsing_entertainment_news_list_page(
                start_url='https://www.cbsnews.com/latest/entertainment/{}/'.
                format(pg))
            self.parsing_science_news_list_page(
                start_url='https://www.cbsnews.com/latest/science/{}/'.format(pg))
            self.parsing_technology_news_list_page(
                start_url='https://www.cbsnews.com/latest/technology/{}/'.
                format(pg))

    def _parse_section_list_page(self, start_url, section, label):
        """Shared worker for all six section parsers.

        Scrapes article links and listing thumbnails from the page's
        ``component-<section>`` block, then crawls and saves each article.
        The six public methods below were byte-for-byte copies of each other
        except for the section name, so they now delegate here.
        """
        res = requests.get(start_url, headers=self.headers,
                           cookies=self.cookies).text
        time.sleep(random.uniform(3, 5))  # polite randomized delay
        html = etree.HTML(res)
        base = ('//section[@id="component-%s"]'
                '//div[@class="component__item-wrapper"]' % section)
        list_page_url = html.xpath(base + '//article//a/@href')
        thumbnail_img = html.xpath(
            base + '//span[@class="img item__thumb item__thumb--crop-0"]'
            '//img/@src')
        # zip() pairs links with thumbnails and stops at the shorter list;
        # the original indexed thumbnail_img[i] and could raise IndexError
        # when the two XPath result lists disagreed in length.
        for details_url, thumb in zip(list_page_url, thumbnail_img):
            data = self.parsing_details_page(details_url=details_url,
                                             thumbnail_img=thumb)
            if data is not None:
                print('%s\n' % label, data)
                self.save.save_data(data=data, news='cbs')

    # -- Public entry points: original names/signatures preserved. --

    def parsing_health_news_list_page(self, start_url):
        self._parse_section_list_page(start_url, 'health', 'health_data')

    def parsing_word_news_list_page(self, start_url):
        # 'word' label/name kept as-is for backward compatibility.
        self._parse_section_list_page(start_url, 'world', 'word_data')

    def parsing_crime_news_list_page(self, start_url):
        self._parse_section_list_page(start_url, 'crime', 'crime_data')

    def parsing_entertainment_news_list_page(self, start_url):
        self._parse_section_list_page(start_url, 'entertainment',
                                      'entertainment_data')

    def parsing_science_news_list_page(self, start_url):
        self._parse_section_list_page(start_url, 'science', 'science_data')

    def parsing_technology_news_list_page(self, start_url):
        self._parse_section_list_page(start_url, 'technology',
                                      'technology_data')

    def parsing_details_page(self, details_url, thumbnail_img):
        """Fetch one article; return its record dict, or None when it was
        already crawled or lacks a title/content/image."""
        result = self.filter.filter_data(details_url=details_url)
        if result:
            print('Data already exists!')
            return None
        details_res = requests.get(details_url, headers=self.headers,
                                   cookies=self.cookies).text
        time.sleep(random.uniform(1, 3))
        html = etree.HTML(details_res)
        source = int(7)      # numeric id assigned to this news source
        sourceUrl = details_url
        jobId = time.time()  # crawl timestamp doubles as the job id
        title = ''.join(html.xpath('//h1[@class="content__title"]/text()'))
        # Body text via fulltext(), '<p>'-joined with blank lines dropped.
        text = fulltext(details_res).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        author = html.xpath(
            '//p[@class="content__meta content__meta-byline"]/text()')
        # NOTE(review): "/n" looks like a typo for "\n"; kept as-is to
        # preserve the stored output exactly.
        authorName = ''.join([i.replace("/n", '<p>').strip() for i in author])
        releaseTimeList = html.xpath(
            '//p[@class="content__meta content__meta-timestamp"]/time/text()')
        releaseTime = ''.join(
            [i.replace("/n", '<p>').strip() for i in releaseTimeList])
        pic_url_list = html.xpath(
            '//span[@class="img embed__content"]//img/@src')
        img = self.download_pic(pic_url_list=pic_url_list,
                                thumbnail_img=thumbnail_img)
        # Skip articles missing any of image, content or title.
        if not img or not content or not title:
            return None
        return {
            'source': source,
            'jobId': int(jobId),
            'sourceUrl': sourceUrl,
            'title': title,
            'authorName': authorName,
            'releaseTime': releaseTime,
            'content': content,
            'img': img
        }

    def download_pic(self, pic_url_list, thumbnail_img):
        """Download the article images (up to 17), falling back to the
        listing thumbnail when the article has none.

        Returns the comma-joined picPath-relative names, or None on any
        download failure.
        """
        try:
            # Random file-name stem so parallel crawls do not collide.
            img_id = str(uuid.uuid4()).replace('-', '')
            if not pic_url_list:
                # No in-article image: save the listing thumbnail instead.
                urllib.request.urlretrieve(
                    thumbnail_img,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-1"))
                return r'%s.jpg' % (self.picPath + str(img_id) + "-1")
            img_list = []
            for index, pic_url in enumerate(pic_url_list[:17], start=1):
                urllib.request.urlretrieve(
                    pic_url,
                    r'%s.jpg' % (self.downloadPath + self.picPath +
                                 str(img_id) + "-" + str(index)))
                img_list.append(r'%s.jpg' % (self.picPath + str(img_id) +
                                             "-" + str(index)))
            return ','.join(img_list)
        except Exception:
            # Download failure -> treat as "no image" so the article is
            # skipped (narrowed from a bare ``except:``).
            return None