def crawl_radikal(start, numOfPages, categoryID):
    """Crawl one Radikal category: collect article IDs, save them, fetch items.

    start -- number of the first listing page to visit
    numOfPages -- how many consecutive listing pages to visit
    categoryID -- key into ``cat_radikal``; the looked-up value is used in
        the listing URL and as the category name for the output
    """
    category = cat_radikal[categoryID]

    site = "radikal"
    item_root = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalDetayV3&ArticleID="
    # Listing pages are addressed as <site>/<category>/tum_haberler-<page>.
    listing_root = "http://www.radikal.com.tr/" + str(category) + "/tum_haberler-"

    # Markers delimiting the fields inside an article page.
    title_open = 'class="turkiye-tc">'
    title_close = '</h1></div>'
    # Text markers in use as of 10 October (older page layouts used others).
    text_open = '<div id="metin2">'
    text_close = '<div class="social-area clearfix sc-news-bottom">'
    date_open = '<span class="date">'
    date_close = '</span><div class="options">'
    # Authors in radikal are not extractable: names sit inside the article
    # text (div id=metin2).
    author_open = '=MuhabirArama&Keyword='
    author_close = '</a>'

    # Markers bounding the listing-page region that holds the article links,
    # and the patterns used to pull IDs out of it.
    ids_open = '<div class="box_z_a"><div class="news mr20">'
    ids_close = '<div id="paging"'
    link_pattern = r"_[a-z0-9]+-[0-9]{6,10}"
    id_pattern = r'[0-9]{6,10}'

    resource = NewsResource(
        site, listing_root, item_root,
        ids_open, ids_close, link_pattern, id_pattern,
        title_open, title_close, text_open, text_close,
        date_open, date_close, author_open, author_close)
    resource.setEncoding('iso-8859-9')

    # Read the root back from the resource rather than the local variable.
    base_url = resource.rootlink_id
    gathered = [news_id
                for page in range(start, start + numOfPages)
                for news_id in retrieveNewsIDs(resource, base_url + str(page))]
    unique_ids = list(set(gathered))

    out_path = (resource.newsidpath + category + "_newsIDs"
                + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, out_path)

    crawlresourceItems(resource, unique_ids, category)
def crawl_aydinlik(start, numOfPages, categoryID):
    """Crawl one Aydinlik category: collect article IDs, save them, fetch items.

    start -- index of the first listing page; the URL offset is page * 10
    numOfPages -- how many consecutive listing pages to visit
    categoryID -- key into ``cat_aydinlik``; the looked-up value is used in
        the URLs and as the category name for the output
    """
    category = cat_aydinlik[categoryID]
    site_root = "http://www.aydinlikgazete.com/" + str(category)

    site = "aydinlik"
    # e.g. http://www.aydinlikgazete.com/ekonomi/30774.html
    item_root = site_root + "/"
    # e.g. http://www.aydinlikgazete.com/ekonomi.html?start=240
    listing_root = site_root + ".html?start="

    # Markers delimiting the fields inside an article page.
    title_open, title_close = '<title>', '</title>'
    text_open = '<p class="articleinfo">'
    text_close = '<span class="article_separator">'
    date_open, date_close = '<span class="createdate">', '</span>'
    # NOTE(review): these author markers look Radikal-specific
    # ('MuhabirArama'); confirm they match anything on this site.
    author_open, author_close = '=MuhabirArama&Keyword=', '</a>'

    # Listing-page region holding the article links, and the ID patterns.
    ids_open = '<table class="category">'
    ids_close = '<ul class="pagination">'
    link_pattern = r'\/[a-z0-9]+\/[0-9]{3,9}-'
    id_pattern = r'[0-9]{3,9}'

    resource = NewsResource(
        site, listing_root, item_root,
        ids_open, ids_close, link_pattern, id_pattern,
        title_open, title_close, text_open, text_close,
        date_open, date_close, author_open, author_close)

    base_url = resource.rootlink_id
    found = []
    for page in range(start, start + numOfPages):
        # The listing is paged by item offset, ten items per page.
        page_url = base_url + str(page * 10)
        found = found + retrieveNewsIDs(resource, page_url)
    unique_ids = list(set(found))

    out_path = (resource.newsidpath + category + "_newsIDs"
                + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, out_path)

    crawlresourceItems(resource, unique_ids, category)
def crawl_solhaber(start, numOfPages, categoryname):
    """Crawl one soL Haber category: collect article IDs, save them, fetch items.

    start -- number of the first listing page to visit
    numOfPages -- how many consecutive listing pages to visit
    categoryname -- category string, concatenated directly into the URLs and
        used as the category name for the output
    """
    site = "solhaber"
    host = "http://haber.sol.org.tr/"
    # e.g. http://haber.sol.org.tr/sonuncu-kavga/<slug>-72210
    item_root = host + categoryname + "/"
    # e.g. http://haber.sol.org.tr/sonuncu-kavga?page=29
    listing_root = host + categoryname + "?page="

    # Markers delimiting the fields inside an article page.
    title_open, title_close = '<h2 class="title node-title">', '</h2>'
    text_open, text_close = '<div class="makale-govde">', '<div id="social-links">'
    date_open, date_close = '<div class="node-date">', '</div>'
    # NOTE(review): these author markers look Radikal-specific
    # ('MuhabirArama'); confirm they match anything on this site.
    author_open, author_close = '=MuhabirArama&Keyword=', '</a>'

    # Listing-page region holding the article links, and the ID patterns.
    ids_open = '<div id="block-views-f_kategori_manset-block_2"'
    ids_close = '<ul class="pager">'
    link_pattern = r'<a href=\".+[0-9]{3,9}\">'
    id_pattern = r'\/[a-z0-9-]+-[0-9]{3,9}'

    resource = NewsResource(
        site, listing_root, item_root,
        ids_open, ids_close, link_pattern, id_pattern,
        title_open, title_close, text_open, text_close,
        date_open, date_close, author_open, author_close)

    base_url = resource.rootlink_id
    # This site uses its own ID retriever (retrieveNewsIDs_sol).
    gathered = [news_id
                for page in range(start, start + numOfPages)
                for news_id in retrieveNewsIDs_sol(resource, base_url + str(page))]
    unique_ids = list(set(gathered))

    out_path = (resource.newsidpath + categoryname + "_newsIDs"
                + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, out_path)

    crawlresourceItems(resource, unique_ids, categoryname)
def crawl_habervaktim(start, numOfPages, categoryID):
    """Crawl one Habervaktim category: collect article IDs, save them, fetch items.

    start -- number of the first listing page to visit
    numOfPages -- how many consecutive listing pages to visit
    categoryID -- key into ``cat_habervaktim``; both the ID itself and the
        looked-up name appear in the listing URL
    """
    category = cat_habervaktim[categoryID]

    site = "vakit"
    # e.g. http://www.habervaktim.com/haber/316875/
    item_root = "http://www.habervaktim.com/haber/"
    # e.g. http://www.habervaktim.com/siyaset-haberleri-4hk-p5.htm
    listing_root = ("http://www.habervaktim.com/" + category
                    + "-haberleri-" + str(categoryID) + "hk-p")

    # Markers delimiting the fields inside an article page.
    title_open, title_close = '<div class="title"><h1>', '</h1></div>'
    text_open, text_close = 'class="text_content">', 'changeTarget("#news_content")'
    date_open, date_close = '<div class="date">', '<div id="news_content"'
    # Empty author markers are passed for this site.
    author_open, author_close = "", ""

    # Listing-page region holding the article links, and the ID patterns.
    ids_open = '<div class="news"><div class="box_news box_news_1">'
    ids_close = '<div class="hor_seperator">'
    link_pattern = r"/haber/[0-9]{6,9}"
    id_pattern = r'[0-9]{6,9}'

    resource = NewsResource(
        site, listing_root, item_root,
        ids_open, ids_close, link_pattern, id_pattern,
        title_open, title_close, text_open, text_close,
        date_open, date_close, author_open, author_close)
    resource.setEncoding('UTF-8')

    base_url = resource.rootlink_id
    found = []
    for page in range(start, start + numOfPages):
        page_url = base_url + str(page) + ".htm"
        found = found + retrieveNewsIDs(resource, page_url)
    unique_ids = list(set(found))

    out_path = (resource.newsidpath + category + "_newsIDs"
                + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, out_path)

    crawlresourceItems(resource, unique_ids, category)
def crawl_yenisafak(start, numOfPages):
    """Crawl the Yeni Safak 'Dunya' section: collect IDs, save them, fetch items.

    Only the section root page is read (one day's listing), so ``start`` and
    ``numOfPages`` affect nothing but the name of the saved ID file.

    start -- used only in the ID file name
    numOfPages -- used only in the ID file name
    """
    site = "yenisafak"
    item_root = "http://www.yenisafak.com.tr/Dunya/?i="
    # A dated listing would be http://www.yenisafak.com.tr/Dunya/?t=dd.mm.yyyy
    listing_root = "http://www.yenisafak.com.tr/Dunya/"

    # Markers delimiting the fields inside an article page.
    title_open, title_close = '<title>', '</title>'
    text_open, text_close = 'class="haberdetaymetin">', 'class="haberdetaytarih"'

    # Listing-page region holding the article links, and the ID patterns.
    ids_open = '<div class="haberdetaydiger2'
    ids_close = '<div class="mngalerivideo'
    link_pattern = r"\&i=[0-9]{5,9}"
    id_pattern = r'[0-9]{5,9}'

    # No date/author markers are supplied for this site.
    resource = NewsResource(
        site, listing_root, item_root,
        ids_open, ids_close, link_pattern, id_pattern,
        title_open, title_close, text_open, text_close)

    # One request against the section root covers a single day.
    day_ids = [] + retrieveNewsIDs(resource, resource.rootlink_id)
    unique_ids = list(set(day_ids))

    out_path = (resource.newsidpath + "dunya_newsIDs"
                + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, out_path)

    crawlresourceItems(resource, unique_ids)
def crawl_cumhuriyet(start, numOfPages, categoryID):
    """Crawl one Cumhuriyet category: collect article IDs, save them, fetch items.

    start -- index of the first listing page; the URL offset is page * 15
    numOfPages -- how many consecutive listing pages to visit
    categoryID -- placed in the listing URL and, after the IDs are gathered,
        used as a key into ``cat_cumhuriyet`` for the category name
    """
    site = "cumhuriyet"
    item_root = "http://www.cumhuriyet.com.tr/?hn="
    listing_root = "http://www.cumhuriyet.com.tr/?kn=" + str(categoryID) + "&ilk="

    # Markers delimiting the fields inside an article page.
    title_open, title_close = '<span class="ehaberBaslik">', '</span></h1>'
    text_open, text_close = 'class="mahrec"', '<p class="tarih">'
    # Date/author markers in use as of 11 October.
    date_open, date_close = '<p class="tarih">', '</p>'
    author_open, author_close = '?yer=yazar&aranan=', '">'

    # Listing-page region holding the article links, and the ID patterns.
    ids_open = '<div class="s1st1_1">'
    ids_close = '<div class="s1st1_2">'
    link_pattern = r"\?hn=[0-9]{5,9}"
    id_pattern = r'[0-9]{5,9}'

    resource = NewsResource(
        site, listing_root, item_root,
        ids_open, ids_close, link_pattern, id_pattern,
        title_open, title_close, text_open, text_close,
        date_open, date_close, author_open, author_close)
    resource.setEncoding('iso-8859-9')

    base_url = resource.rootlink_id
    found = []
    for page in range(start, start + numOfPages):
        # The listing is paged by item offset, fifteen items per page.
        page_url = base_url + str(page * 15)
        print(page_url)
        found = found + retrieveNewsIDs(resource, page_url)
    unique_ids = list(set(found))

    # Looked up only after retrieval, as in the original flow.
    category = cat_cumhuriyet[categoryID]
    out_path = (resource.newsidpath + category + "_newsIDs"
                + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, out_path)

    crawlresourceItems(resource, unique_ids, category)
#crawl_radikal(start, numOfPages, cid) crawl_habervaktim(start, numOfPages, cid) start += 2*i #crawl_cumhuriyet(start, numOfPages, cid) ''' ''' NOTLAR id retrieval icin date'e gore range belirleme kismi eksik. Bu haliyle en guncel, son iki veya bir gunun haber id'leri cekiliyor ''' ''' start with initializing radikal as a NewsResource object ''' ''' name = "radikal" rootlink_item = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalDetayV3&ArticleID=" rootlink_id = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalKategoriTumuV3&CategoryID=81&PAGE=" #item markerTitle1 = '<title>' markerTitle2 = '</title>' markerText1 = '<div id="metin2" class="fck_li">' markerText2 = '<div class="IndexKeywordsHeader"' # veya 'id="hiddenTitle"' #id idlimit1 = "<div class=\"cat-news\"><ol";
url = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalKategoriTumuV3&CategoryID=81&PAGE=1" IDlist = retrieveNewsIDs(url) print "List: ",IDlist start = 1 numOfPages = 2 rooturl = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalKategoriTumuV3&CategoryID=81&PAGE=" IDlist = [] for i in range(start,start+numOfPages): url = rooturl + str(i) IDlist = IDlist + retrieveNewsIDs(url) IDlist = list(set(IDlist)) path = IO.IDlistPath+os.sep+"dunya_newsIDs"+str(start)+"-"+str(numOfPages)+".txt" IO.todisc_list(IDlist, path) rootlink_news = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalDetayV3&ArticleID=" for newsid in IDlist: newslink = rootlink_news + str(newsid) extraction = getnewsitem2(newslink) extraction.toConsole() ''' rawhtml = readhtml(url)