コード例 #1
0
 def __init__(self, name, rootlink_id, rootlink_item,
              idlimit1, idlimit2,
              idpatternInLink1, idpatternInLink2,
              markerTitle1, markerTitle2,
              markerText1, markerText2,
              markerDate1="", markerDate2="",
              markerAuthor1="", markerAuthor2=""):
     """Record the crawl settings of one news resource.

     Every argument is stored unchanged on the instance under the same
     name. In addition, ``newsidpath`` is set to the value returned by
     ``IO.ensure_dir`` for ``<IO.IDlistPath>/<name>/``.

     The ``...1`` / ``...2`` arguments come in pairs (ID limits, ID
     patterns in links, title/text/date/author markers).
     NOTE(review): presumably each pair is a start/end delimiter used when
     parsing pages -- confirm against the code that reads them.
     The date and author markers are optional and default to "".
     """
     # Identity of the resource and its root links.
     self.name = name
     self.rootlink_id, self.rootlink_item = rootlink_id, rootlink_item

     # Per-resource ID-list directory; keep whatever IO.ensure_dir returns.
     idlist_dir = os.sep.join((IO.IDlistPath, self.name, ""))
     self.newsidpath = IO.ensure_dir(idlist_dir)

     # ID limits and the ID patterns looked for inside links.
     self.idlimit1, self.idlimit2 = idlimit1, idlimit2
     self.idpatternInLink1, self.idpatternInLink2 = idpatternInLink1, idpatternInLink2

     # Marker pairs for the individual fields of a news item.
     self.markerTitle1, self.markerTitle2 = markerTitle1, markerTitle2
     self.markerText1, self.markerText2 = markerText1, markerText2
     self.markerDate1, self.markerDate2 = markerDate1, markerDate2
     self.markerAuthor1, self.markerAuthor2 = markerAuthor1, markerAuthor2
コード例 #2
0
def crawlresourceItems(resource, IDlist, categoryname):
    """Fetch and store the news items of one category that are not on disk yet.

    IDs that already appear among the file names (extension removed) of the
    category's item directory are skipped. Every remaining ID is turned into
    a link, fetched with ``getnewsitem``, and -- when the result is truthy --
    tagged with ``categoryname`` and written to that directory.

    Args:
        resource: object providing ``name`` and ``rootlink_item``; it is also
            passed through to ``getnewsitem``.
        IDlist: iterable of news IDs to crawl. The caller's object is not
            modified.
        categoryname: used both as the sub-directory name under the
            resource's item directory and as the category set on each
            extraction.
    """
    path = IO.ensure_dir(IO.itemsPath + os.sep + resource.name + os.sep + categoryname + os.sep)
    rootlink_news = resource.rootlink_item

    # Drop IDs already stored under resource/categoryname. A set makes each
    # membership test O(1) instead of scanning the whole file-name list.
    # NOTE(review): IDs are compared as-is with file names; if IDlist holds
    # ints while the file names are strings nothing is ever skipped -- confirm.
    crawledIDs = set(IO.getfilenames_of_dir(path, removeextension=True))
    pendingIDs = [newsid for newsid in IDlist if newsid not in crawledIDs]

    for newsid in pendingIDs:
        newslink = rootlink_news + str(newsid)
        if resource.name == "vakit":
            # Item links of the "vakit" resource get a trailing slash.
            newslink += "/"
        # Parenthesised form prints the same on Python 2 and parses on Python 3.
        print(newslink)

        extraction = getnewsitem(resource, newslink, newsid)
        if not extraction:
            continue

        extraction.setcategory(categoryname)
        # Random 3-9 second pause before storing; presumably throttles the
        # crawl. Only applied after a successful extraction.
        time.sleep(random.choice(range(3, 10)))
        extraction.toDisc(path)