import glob
import time

import crawling_config

def main():
    while True:
        # get the file queue for this run (this locks the queue until it finishes)
        feedQueue = FileQueue(crawling_config.DIR_QUEUE, crawling_config.DIR_LOCKS, 'r')
        links = feedQueue.getItems()

        # go through the queued links
        for linkRaw in links:
            linkParts = linkRaw.split('|')
            if len(linkParts) != 2:
                # invalid line, skip it
                continue
            link = linkParts[0]
            section = linkParts[1]

            linkFile = getLinkFile(link)
            if not fileExists(linkFile):
                # the link has not been fetched yet, do it now
                crawling_config.DEBUG('Fetching %s.' % link)
                fetchLink(link, linkFile, section)

        feedQueue.delete()

        # count the queue files: do not wait if there are 3 or more pending
        queueFiles = glob.glob(crawling_config.DIR_QUEUE + '*')
        if len(queueFiles) < 3:
            crawling_config.DEBUG('Sleeping for %d seconds' % crawling_config.SLEEP_TIME)
            time.sleep(crawling_config.SLEEP_TIME)
        else:
            print 'There are %d queue files, continue immediately!' % len(queueFiles)

    print 'Bye bye'
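# ---------------------------------------------------------------
# The fetcher above assumes a few helpers defined elsewhere in the
# script: getLinkFile(), fileExists() and fetchLink(). A minimal
# sketch of what they might look like follows; the function names
# come from the calls above, but the storage scheme (pages cached
# under a crawling_config.DIR_DATA directory, keyed by an MD5 of
# the URL) is an assumption for illustration only.
# ---------------------------------------------------------------
import hashlib
import os
import urllib2

import crawling_config

def getLinkFile(link):
    # map a URL to a cache file path; hashing keeps the name filesystem-safe
    # (DIR_DATA is assumed here, not taken from the code above)
    return crawling_config.DIR_DATA + hashlib.md5(link).hexdigest()

def fileExists(path):
    return os.path.exists(path)

def fetchLink(link, linkFile, section):
    # download the page and store it on disk, tagged with its section
    try:
        response = urllib2.urlopen(link, timeout=30)
        f = open(linkFile, 'w')
        f.write(section + '\n')
        f.write(response.read())
        f.close()
    except (urllib2.URLError, IOError):
        crawling_config.DEBUG('Failed to fetch %s.' % link)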
import glob
import os
import time

import crawling_config

def main():
    global FLAG_SLEEPING  # set to True while the script is sleeping

    # get the list of feeds from the 'feeds' directory next to this script
    feedLists = glob.glob(crawling_config.DIR_FEED_LISTS + '*')

    while True:
        # get the file queue for this run (this locks the queue until it finishes)
        feedQueue = FileQueue(crawling_config.DIR_QUEUE, crawling_config.DIR_LOCKS)
        saved = 0

        # go through the feeds
        for feedList in feedLists:
            feedLock = FileLock(crawling_config.DIR_LOCKS + os.path.basename(feedList))
            if not feedLock.isLocked():
                # the feed is not locked at the moment, we may proceed...
                # but first, we have to lock it ourselves
                feedLock.lock()
                try:
                    feedFile = open(feedList, 'r')
                    crawling_config.DEBUG('Processing %s' % feedList)
                    feedMode = feedFile.readline().strip()
                    feedModeConfig = feedFile.readline().strip()
                    feedUrls = feedFile.read().strip().split('\n')
                    feedFile.close()

                    for feedUrl in feedUrls:
                        feedUrlParts = feedUrl.split(' ')
                        if len(feedUrlParts) != 2:
                            # malformed feed line, skip it instead of crashing
                            continue
                        links = readFeed(feedUrlParts[0], feedMode, feedModeConfig, feedUrlParts[1])
                        saved += feedQueue.saveList(links)
                except IOError:
                    # the feed list could not be read; try again next round
                    pass
                # remove the lock for other sessions
                feedLock.unlock()
            else:
                # oops, it's locked at the moment
                crawling_config.DEBUG('Bypassed %s because it is being locked.' % feedList)

        # free the file queue
        feedQueue.close()
        crawling_config.DEBUG('Saved %d links to queue.' % saved)

        # check whether SIGINT has been sent to this script
        if FLAG_EXIT_NOW:
            print 'Stopping now!'
            break

        FLAG_SLEEPING = True
        crawling_config.DEBUG('Sleeping for %d seconds' % crawling_config.SLEEP_TIME)
        time.sleep(crawling_config.SLEEP_TIME)
        FLAG_SLEEPING = False

    print 'Bye bye'
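# ---------------------------------------------------------------
# Both loops above lean on the same two primitives: a FileLock that
# marks a feed (or the queue) as in use, and a FileQueue that hands
# batches of 'link|section' lines from the feed reader to the
# fetcher. Neither class is shown in this section, so the following
# is a minimal sketch of the contract the calls above assume. The
# class and method names come from the code; the storage details
# (lock files as empty markers, one timestamped queue file per
# writer session) are assumptions, and the wait-then-lock step is
# simplified and not race-free.
# ---------------------------------------------------------------
import glob
import os
import time

class FileLock:
    """An empty marker file; its existence means 'locked'."""
    def __init__(self, path):
        self.path = path

    def isLocked(self):
        return os.path.exists(self.path)

    def lock(self):
        open(self.path, 'w').close()

    def unlock(self):
        if os.path.exists(self.path):
            os.remove(self.path)

class FileQueue:
    """Hands link batches from the feed reader to the fetcher via queue files."""
    def __init__(self, dirQueue, dirLocks, mode='w'):
        self.mode = mode
        self.lock = FileLock(dirLocks + 'queue')
        while self.lock.isLocked():
            time.sleep(1)  # wait for the other session to finish
        self.lock.lock()
        if mode == 'r':
            # the fetcher consumes the oldest queue file, if any
            files = sorted(glob.glob(dirQueue + '*'))
            self.path = files[0] if files else None
        else:
            # each writer session gets its own, timestamped queue file
            self.path = dirQueue + str(int(time.time()))

    def saveList(self, links):
        # append 'link|section' lines; returns how many were written
        f = open(self.path, 'a')
        for link in links:
            f.write(link + '\n')
        f.close()
        return len(links)

    def getItems(self):
        if self.path is None or not os.path.exists(self.path):
            return []
        f = open(self.path, 'r')
        items = f.read().strip().split('\n')
        f.close()
        return items

    def delete(self):
        # consume the queue file and release the lock
        if self.path and os.path.exists(self.path):
            os.remove(self.path)
        self.lock.unlock()

    def close(self):
        self.lock.unlock()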