# Standard-library imports used below; IO and getnewsitem are project-level
# helpers assumed to be defined/imported elsewhere in this module.
import os
import random
import time


def __init__(self, name, rootlink_id, rootlink_item, idlimit1, idlimit2,
             idpatternInLink1, idpatternInLink2, markerTitle1, markerTitle2,
             markerText1, markerText2, markerDate1="", markerDate2="",
             markerAuthor1="", markerAuthor2=""):
    self.name = name
    self.rootlink_id = rootlink_id
    self.rootlink_item = rootlink_item
    # per-resource directory that holds the crawled news id lists
    self.newsidpath = IO.IDlistPath + os.sep + self.name + os.sep
    self.newsidpath = IO.ensure_dir(self.newsidpath)
    self.idlimit1 = idlimit1
    self.idlimit2 = idlimit2
    self.markerTitle1 = markerTitle1
    self.markerTitle2 = markerTitle2
    self.markerText1 = markerText1
    self.markerText2 = markerText2
    self.idpatternInLink1 = idpatternInLink1
    self.idpatternInLink2 = idpatternInLink2
    self.markerDate1 = markerDate1
    self.markerDate2 = markerDate2
    self.markerAuthor1 = markerAuthor1
    self.markerAuthor2 = markerAuthor2
def crawlresourceItems(resource, IDlist, categoryname):
    # directory where the items of this resource/category are stored on disk
    path = IO.ensure_dir(IO.itemsPath + os.sep + resource.name + os.sep + categoryname + os.sep)
    rootlink_news = resource.rootlink_item
    # update IDlist: drop ids that are already crawled into resource/categoryname
    crawledIDs = IO.getfilenames_of_dir(path, removeextension=True)
    IDlist = [newsid for newsid in IDlist if newsid not in crawledIDs]
    for newsid in IDlist:
        newslink = rootlink_news + str(newsid)
        if resource.name == "vakit":
            newslink += "/"
        print(newslink)
        extraction = getnewsitem(resource, newslink, newsid)
        if extraction:
            extraction.setcategory(categoryname)
            # pause a few seconds between requests to stay polite to the server
            time.sleep(random.choice(range(3, 10)))
            # time.sleep(20)
            # extraction.toConsole()
            extraction.toDisc(path)
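# Hedged usage sketch (not part of the original module): illustrates how a
# resource built with the __init__ above could be passed to crawlresourceItems.
# The class name "Resource", the URLs, marker strings, and id range below are
# illustrative assumptions; only the parameter order follows the constructor.
#
# resource = Resource(
#     name="vakit",
#     rootlink_id="http://example.com/category/",    # assumed listing URL
#     rootlink_item="http://example.com/news/",      # assumed item URL prefix
#     idlimit1=1, idlimit2=1000,
#     idpatternInLink1="/news/", idpatternInLink2="/",
#     markerTitle1="<h1>", markerTitle2="</h1>",
#     markerText1='<div class="text">', markerText2="</div>",
# )
# IDlist = range(1, 50)                              # candidate news ids
# crawlresourceItems(resource, IDlist, "politics")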