コード例 #1
0
def crawl_radikal(start, numOfPages, categoryID):
    """Collect article IDs from radikal.com.tr listing pages and crawl them.

    start       -- number of the first listing page to read
    numOfPages  -- how many consecutive listing pages to read
    categoryID  -- key into ``cat_radikal``; the looked-up value is used both
                   in the listing URL and as the category name

    The IDs found on the listing pages are de-duplicated, passed to
    ``IO.todisc_list`` together with a path under ``resource.newsidpath``,
    and finally handed to ``crawlresourceItems``.
    """
    categoryName = cat_radikal[categoryID]

    # Root URLs: articles are addressed by ID, listings by page number.
    item_root = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalDetayV3&ArticleID="
    listing_root = "http://www.radikal.com.tr/" + str(categoryName) + "/tum_haberler-"

    # Markers around the fields of an article page (page layout as of October 10).
    # The older layout used '<div id="metin2" class="fck_li">' and
    # '<div class="IndexKeywordsHeader"' around the text.
    title_open = 'class="turkiye-tc">'
    title_close = '</h1></div>'
    body_open = '<div id="metin2">'
    body_close = '<div class="social-area clearfix sc-news-bottom">'
    date_open = '<span class="date">'
    date_close = '</span><div class="options">'

    # Original author's note: authors cannot be extracted for radikal; the
    # names are inside the text (div id=metin2..).
    author_open = '=MuhabirArama&amp;Keyword='
    author_close = '</a>'

    # Markers around the region of a listing page that is searched for IDs.
    list_open = '<div class="box_z_a"><div class="news mr20">'
    list_close = '<div id="paging"'

    # NOTE(review): presumably the first pattern matches a link fragment and
    # the second pulls the numeric ID out of it -- confirm in retrieveNewsIDs.
    link_pattern = r"_[a-z0-9]+-[0-9]{6,10}"
    id_pattern = r'[0-9]{6,10}'

    resource = NewsResource(
        "radikal", listing_root, item_root,
        list_open, list_close,
        link_pattern, id_pattern,
        title_open, title_close,
        body_open, body_close,
        date_open, date_close,
        author_open, author_close,
    )
    resource.setEncoding('iso-8859-9')

    # Gather the IDs of every requested listing page, then drop duplicates.
    listing_base = resource.rootlink_id
    collected = []
    for page in range(start, start + numOfPages):
        collected.extend(retrieveNewsIDs(resource, listing_base + str(page)))
    unique_ids = list(set(collected))

    # The file name records the start page and the page count.
    dump_path = (resource.newsidpath + categoryName + "_newsIDs"
                 + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, dump_path)

    crawlresourceItems(resource, unique_ids, categoryName)
コード例 #2
0
def crawl_aydinlik(start, numOfPages, categoryID):
    """Collect article IDs from aydinlikgazete.com listing pages and crawl them.

    start       -- index of the first listing page; the URL offset is page * 10
    numOfPages  -- how many consecutive listing pages to read
    categoryID  -- key into ``cat_aydinlik``; the looked-up value is used in
                   both root URLs and as the category name

    The de-duplicated IDs are passed to ``IO.todisc_list`` and then to
    ``crawlresourceItems``.
    """
    categoryName = cat_aydinlik[categoryID]
    section_root = "http://www.aydinlikgazete.com/" + str(categoryName)

    # Example article: http://www.aydinlikgazete.com/ekonomi/30774.html
    item_root = section_root + "/"
    # Example listing: http://www.aydinlikgazete.com/ekonomi.html?start=240
    listing_root = section_root + ".html?start="

    # Markers around the fields of an article page.
    title_open = '<title>'
    title_close = '</title>'
    body_open = '<p class="articleinfo">'
    body_close = '<span class="article_separator">'
    date_open = '<span class="createdate">'
    date_close = '</span>'

    # NOTE(review): these author markers are the same strings used for
    # radikal; presumably they do not match this site -- confirm.
    author_open = '=MuhabirArama&amp;Keyword='
    author_close = '</a>'

    # Markers around the region of a listing page that is searched for IDs.
    list_open = '<table class="category">'
    list_close = '<ul class="pagination">'

    link_pattern = r'\/[a-z0-9]+\/[0-9]{3,9}-'
    id_pattern = r'[0-9]{3,9}'

    resource = NewsResource(
        "aydinlik", listing_root, item_root,
        list_open, list_close,
        link_pattern, id_pattern,
        title_open, title_close,
        body_open, body_close,
        date_open, date_close,
        author_open, author_close,
    )

    # Listings are paged by offset: page i starts at item i * 10.
    listing_base = resource.rootlink_id
    collected = []
    for page in range(start, start + numOfPages):
        collected.extend(retrieveNewsIDs(resource, listing_base + str(page * 10)))
    unique_ids = list(set(collected))

    # The file name records the start page and the page count.
    dump_path = (resource.newsidpath + categoryName + "_newsIDs"
                 + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, dump_path)

    crawlresourceItems(resource, unique_ids, categoryName)
コード例 #3
0
def crawl_solhaber(start, numOfPages, categoryname):
    """Collect article IDs from haber.sol.org.tr listing pages and crawl them.

    start         -- number of the first listing page to read
    numOfPages    -- how many consecutive listing pages to read
    categoryname  -- category path segment, used verbatim in the URLs and in
                     the name of the ID file

    IDs are gathered with ``retrieveNewsIDs_sol``, de-duplicated, passed to
    ``IO.todisc_list`` and then to ``crawlresourceItems``.
    """
    site_root = "http://haber.sol.org.tr/"

    # Example article:
    # http://haber.sol.org.tr/sonuncu-kavga/tupras-ve-petrol-is-anlasti-petrol-iscisine-yuzde-82-zam-haberi-72210
    item_root = site_root + categoryname + "/"
    # Example listing: http://haber.sol.org.tr/sonuncu-kavga?page=29
    listing_root = site_root + categoryname + "?page="

    # Markers around the fields of an article page.
    title_open = '<h2 class="title node-title">'
    title_close = '</h2>'
    body_open = '<div class="makale-govde">'
    body_close = '<div id="social-links">'
    date_open = '<div class="node-date">'
    date_close = '</div>'

    # NOTE(review): these author markers are the same strings used for
    # radikal; presumably they do not match this site -- confirm.
    author_open = '=MuhabirArama&amp;Keyword='
    author_close = '</a>'

    # Markers around the region of a listing page that is searched for IDs.
    list_open = '<div id="block-views-f_kategori_manset-block_2"'
    list_close = '<ul class="pager">'

    link_pattern = r'<a href=\".+[0-9]{3,9}\">'
    id_pattern = r'\/[a-z0-9-]+-[0-9]{3,9}'

    resource = NewsResource(
        "solhaber", listing_root, item_root,
        list_open, list_close,
        link_pattern, id_pattern,
        title_open, title_close,
        body_open, body_close,
        date_open, date_close,
        author_open, author_close,
    )

    # Gather the IDs of every requested listing page, then drop duplicates.
    listing_base = resource.rootlink_id
    collected = []
    for page in range(start, start + numOfPages):
        collected.extend(retrieveNewsIDs_sol(resource, listing_base + str(page)))
    unique_ids = list(set(collected))

    # The file name records the start page and the page count.
    dump_path = (resource.newsidpath + categoryname + "_newsIDs"
                 + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, dump_path)

    crawlresourceItems(resource, unique_ids, categoryname)
コード例 #4
0
def crawl_habervaktim(start, numOfPages, categoryID):
    """Collect article IDs from habervaktim.com listing pages and crawl them.

    start       -- number of the first listing page to read
    numOfPages  -- how many consecutive listing pages to read
    categoryID  -- key into ``cat_habervaktim``; both the looked-up name and
                   the ID itself appear in the listing URL

    The resource is registered under the name "vakit". De-duplicated IDs are
    passed to ``IO.todisc_list`` and then to ``crawlresourceItems``.
    """
    categoryName = cat_habervaktim[categoryID]

    # Example article: http://www.habervaktim.com/haber/316875/
    item_root = "http://www.habervaktim.com/haber/"
    # Example listing: http://www.habervaktim.com/siyaset-haberleri-4hk-p5.htm
    listing_root = ("http://www.habervaktim.com/" + categoryName
                    + "-haberleri-" + str(categoryID) + "hk-p")

    # Markers around the fields of an article page.
    title_open = '<div class="title"><h1>'
    title_close = '</h1></div>'
    body_open = 'class="text_content">'
    body_close = 'changeTarget("#news_content")'
    date_open = '<div class="date">'
    date_close = '<div id="news_content"'
    # No author markers are configured for this site.
    author_open = ""
    author_close = ""

    # Markers around the region of a listing page that is searched for IDs.
    list_open = '<div class="news"><div class="box_news box_news_1">'
    list_close = '<div class="hor_seperator">'

    link_pattern = r"/haber/[0-9]{6,9}"
    id_pattern = r'[0-9]{6,9}'

    resource = NewsResource(
        "vakit", listing_root, item_root,
        list_open, list_close,
        link_pattern, id_pattern,
        title_open, title_close,
        body_open, body_close,
        date_open, date_close,
        author_open, author_close,
    )
    resource.setEncoding('UTF-8')

    # Listing URLs end with "<page>.htm".
    listing_base = resource.rootlink_id
    collected = []
    for page in range(start, start + numOfPages):
        collected.extend(retrieveNewsIDs(resource, listing_base + str(page) + ".htm"))
    unique_ids = list(set(collected))

    # The file name records the start page and the page count.
    dump_path = (resource.newsidpath + categoryName + "_newsIDs"
                 + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, dump_path)

    crawlresourceItems(resource, unique_ids, categoryName)
コード例 #5
0
def crawl_yenisafak(start, numOfPages):
    """Collect article IDs from the yenisafak.com.tr "Dunya" page and crawl them.

    Only the single listing page at the root URL is read (one day's worth,
    per the original author's note); ``start`` and ``numOfPages`` are used
    solely to build the name of the ID file.

    The de-duplicated IDs are passed to ``IO.todisc_list`` and then to
    ``crawlresourceItems``.
    """
    item_root = "http://www.yenisafak.com.tr/Dunya/?i="
    # A dated listing would look like http://www.yenisafak.com.tr/Dunya/?t=dd.mm.yyyy
    listing_root = "http://www.yenisafak.com.tr/Dunya/"

    # Markers around the fields of an article page.
    title_open = '<title>'
    title_close = '</title>'
    body_open = 'class="haberdetaymetin">'
    body_close = 'class="haberdetaytarih"'

    # Markers around the region of the listing page that is searched for IDs.
    list_open = '<div class="haberdetaydiger2'
    list_close = '<div class="mngalerivideo'

    link_pattern = r"\&i=[0-9]{5,9}"
    id_pattern = r'[0-9]{5,9}'

    # No date or author markers are passed for this site.
    resource = NewsResource(
        "yenisafak", listing_root, item_root,
        list_open, list_close,
        link_pattern, id_pattern,
        title_open, title_close,
        body_open, body_close,
    )

    # One listing page only; drop duplicate IDs.
    unique_ids = list(set(retrieveNewsIDs(resource, resource.rootlink_id)))

    dump_path = (resource.newsidpath + "dunya_newsIDs"
                 + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, dump_path)

    crawlresourceItems(resource, unique_ids)
コード例 #6
0
def crawl_cumhuriyet(start, numOfPages, categoryID):
    """Collect article IDs from cumhuriyet.com.tr listing pages and crawl them.

    start       -- index of the first listing page; the URL offset is page * 15
    numOfPages  -- how many consecutive listing pages to read
    categoryID  -- category number used in the listing URL and as key into
                   ``cat_cumhuriyet`` for the category name

    Each listing URL is printed before it is fetched. The de-duplicated IDs
    are passed to ``IO.todisc_list`` and then to ``crawlresourceItems``.
    """
    # Resolve the category name first so that an unknown categoryID fails
    # before any listing page is downloaded.
    categoryName = cat_cumhuriyet[categoryID]

    item_root = "http://www.cumhuriyet.com.tr/?hn="
    listing_root = "http://www.cumhuriyet.com.tr/?kn=" + str(categoryID) + "&ilk="

    # Markers around the fields of an article page.
    # The older layout used 'data-text="' as the opening text marker.
    title_open = '<span class="ehaberBaslik">'
    title_close = '</span></h1>'
    body_open = 'class="mahrec"'
    body_close = '<p class="tarih">'    # or 'id="hiddenTitle"'

    # Date and author markers as of October 11.
    date_open = '<p class="tarih">'
    date_close = '</p>'
    author_open = '?yer=yazar&amp;aranan='
    author_close = '">'

    # Markers around the region of a listing page that is searched for IDs.
    list_open = '<div class="s1st1_1">'
    list_close = '<div class="s1st1_2">'

    link_pattern = r"\?hn=[0-9]{5,9}"
    id_pattern = r'[0-9]{5,9}'

    resource = NewsResource(
        "cumhuriyet", listing_root, item_root,
        list_open, list_close,
        link_pattern, id_pattern,
        title_open, title_close,
        body_open, body_close,
        date_open, date_close,
        author_open, author_close,
    )
    resource.setEncoding('iso-8859-9')

    # Listings are paged by offset: page i starts at item i * 15.
    listing_base = resource.rootlink_id
    collected = []
    for page in range(start, start + numOfPages):
        url = listing_base + str(page * 15)
        # Parenthesised single-argument form prints the same text under
        # Python 2 and is also valid Python 3.
        print(url)
        collected.extend(retrieveNewsIDs(resource, url))
    unique_ids = list(set(collected))

    # The file name records the start page and the page count.
    dump_path = (resource.newsidpath + categoryName + "_newsIDs"
                 + str(start) + "-" + str(numOfPages) + ".txt")
    IO.todisc_list(unique_ids, dump_path)

    crawlresourceItems(resource, unique_ids, categoryName)
コード例 #7
0
            #crawl_radikal(start, numOfPages, cid)
            crawl_habervaktim(start, numOfPages, cid)
            start += 2*i
            #crawl_cumhuriyet(start, numOfPages, cid)
    '''


''' 
NOTES
For id retrieval, the part that determines the range by date is still missing. As it stands, only the news ids of the most recent one or two days are fetched.
'''

'''   start with initializing radikal as a NewsResource object   '''


'''
name = "radikal"
rootlink_item = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalDetayV3&ArticleID="
rootlink_id = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalKategoriTumuV3&CategoryID=81&PAGE="


#item
markerTitle1 = '<title>'
markerTitle2 = '</title>'

markerText1 = '<div id="metin2" class="fck_li">'
markerText2 = '<div class="IndexKeywordsHeader"'    # veya 'id="hiddenTitle"'


#id
idlimit1 = "<div class=\"cat-news\"><ol";
コード例 #8
0
url = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalKategoriTumuV3&CategoryID=81&PAGE=1"
IDlist = retrieveNewsIDs(url)
print "List: ",IDlist


start = 1
numOfPages = 2
rooturl = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalKategoriTumuV3&CategoryID=81&PAGE="
IDlist = []
for i in range(start,start+numOfPages):
    url = rooturl + str(i)
    IDlist = IDlist + retrieveNewsIDs(url)

IDlist = list(set(IDlist))
path = IO.IDlistPath+os.sep+"dunya_newsIDs"+str(start)+"-"+str(numOfPages)+".txt"
IO.todisc_list(IDlist, path)



rootlink_news = "http://www.radikal.com.tr/Radikal.aspx?aType=RadikalDetayV3&ArticleID="
for newsid in IDlist:
    newslink = rootlink_news + str(newsid)
    extraction = getnewsitem2(newslink)
    extraction.toConsole()



'''
rawhtml = readhtml(url)