def __init__(self):
    # Persistent, deduplicating work queue stored on disk at ./q
    self.q = persistqueue.UniqueQ(path='q')
    self.rate = 1.0
    # Authenticate against the Twitter API via tweepy
    auth = tweepy.OAuthHandler(settings.CONSUMER_KEY, settings.CONSUMER_SECRET)
    auth.set_access_token(settings.ACCESS_TOKEN, settings.ACCESS_TOKEN_SECRET)
    self.api = tweepy.API(auth)
    self.sleep_time = 60.0
def populate_queue(self):
    """
    Reload the entries from the filenames database every 5 seconds and watch
    for file changes by comparing the modification time stored in the database
    with the file's current modification time. A mismatch triggers a re-upload.
    """
    entries = self.reload_files_to_watch()
    for e in entries:
        if not os.path.isfile(e):
            continue
        if entries[e]["mtime"] < os.stat(e).st_mtime:
            logging.info(f"{e} changed. It will be queued for re-uploading")
            reprocess_queue = persistqueue.UniqueQ(self.queue_location)
            reprocess_queue.put(e)
def process_queue(self):
    """
    Pop files off the queue and send each one to the stash() click command
    for processing.
    """
    logging.info("Starting queue processor")
    reprocess_queue = persistqueue.UniqueQ(self.queue_location)
    i = 0
    while i < reprocess_queue.size:
        logging.info(reprocess_queue.size)
        this_file = reprocess_queue.get()
        logging.info(
            f"{this_file} was taken from the queue. It is now being sent for processing"
        )
        stash.callback(None, None, None, None, True, this_file)
        i += 1
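# A minimal driver sketch (an assumption, not shown in the snippets above): it
# presumes a watcher object exposing populate_queue(), process_queue(), and a
# sleep_time attribute, and simply alternates the two methods on a fixed interval.
import time

def run_watcher(watcher):
    while True:
        watcher.populate_queue()   # enqueue files whose mtime changed
        watcher.process_queue()    # drain the queue and re-upload each file
        time.sleep(watcher.sleep_time)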
import persistqueue
import random

# Persistent, deduplicating queue shared by the helpers below.
mqueue = persistqueue.UniqueQ("random", multithreading=True)


def getrandomstring(length=10):
    mstring = "abcdefghijklmnopqrstuvwxyz123456789"
    return "".join(random.choice(mstring) for x in range(length))


def populatequeue(numentries=100):
    for i in range(numentries):
        mqueue.put(getrandomstring())


def getvalue():
    return mqueue.get()


def getvalues(numvalues):
    return [getvalue() for x in range(numvalues)]
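# Usage sketch (not part of the original snippet): fill the queue with a few
# random strings and drain however many items are actually stored. UniqueQ
# silently drops duplicate puts, so draining mqueue.size items avoids a
# blocking get() on an empty queue.
if __name__ == "__main__":
    populatequeue(5)
    print(getvalues(mqueue.size))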
def getUsersToTreat():
    users = persistqueue.UniqueQ(
        '/warehouse/COMPLEXNET/ltarrade/ressources/users_persistQueue',
        auto_commit=True)
    return users
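# Consumer sketch (hypothetical; process_user is a stand-in for whatever the
# project actually does with each user): pop entries from the persistent queue
# until it is empty.
def treatAllUsers(process_user):
    users = getUsersToTreat()
    while users.size > 0:
        process_user(users.get())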
def repec_scraper(db_session,
                  cache,
                  seed_handles,
                  max_links=100,
                  persist_at=settings.CACHE_LOCATION):
    if not persist_at.endswith("/"):
        persist_at = persist_at + "/"

    # Initialize Queue object to store RePEC handles; fill it with seed handles.
    repec_queue = persistqueue.UniqueQ(persist_at + "scraper_queue",
                                       auto_commit=True)

    # Step 1: Check if the articles db is empty. If it is, then we need to make
    # sure that the scraping queue is empty too.
    citation_chain_count = db_session.query(db.CitationChain).count()
    citation_db_empty = citation_chain_count == 0

    # Clear the queue if the citations table is empty
    if citation_db_empty:
        logging.warning("Citations table is empty, so I'm clearing the repec_queue")
        while not repec_queue.empty():
            repec_queue.get(timeout=0)

    # Initiate counter for article entries and link count
    link_count = len(repec_queue) + citation_chain_count

    # Add seed handles to the queue if they haven't been visited previously
    logging.info("Adding seed handles to queue...")
    for seed_handle in seed_handles:
        existing_entry = db_session.query(db.Article).filter_by(handle=seed_handle).first()
        # Because these articles are at the 'root' of the citation chain, the chain list is empty
        if existing_entry is None:
            repec_queue.put(ArticleInfo(seed_handle, []))
            link_count += 1

    # Spider through the queue
    while not repec_queue.empty():
        current = repec_queue.get(timeout=0)
        logging.info("Current queue length now: " + str(len(repec_queue)))
        existing_entry = db_session.query(db.Article).filter_by(handle=current.handle).scalar()

        if existing_entry is None:
            try:
                # Download RePEC data and add current counter value as article ID
                logging.info("Getting RePEC data for " + current.handle)
                article_info = get_repec_data(cache, current.handle)
                write_article(article_info, db_session)
                latest_article_id = db.latest_article_id(db_session)

                # Add current citation chain to db
                updated_citation_chain = current.citation_chain + [latest_article_id]
                write_citation_chain(latest_article_id, updated_citation_chain, db_session)

                # If we are below max_links, then get CitEc cites and add them to the queue
                if link_count < max_links:
                    logging.info("Getting cites for " + current.handle)
                    cites = get_citec_cites(cache, current.handle)
                    for handle in cites:
                        # The second argument appends the new article's ID onto the
                        # current citation chain, e.g. [1, 2] -> [1, 2, 3].
                        to_put = ArticleInfo(handle, current.citation_chain + [latest_article_id])
                        repec_queue.put(to_put)
                        link_count += 1
                        logging.info("Current value of link_count : " + str(link_count))
                        if link_count > max_links:
                            break
                else:
                    logging.info("No room left in queue; skipping cites for " + current.handle)
            except AttributeError:
                logging.warning("No RePEC data for " + current.handle)
            except json.decoder.JSONDecodeError:
                logging.error("Problem decoding JSON for " + current.handle + ". Skipping this one.")
            except NoDataException:
                logging.warning("CitEc data missing for " + current.handle)
        else:
            # If the handle is already in the database, then we need to add the citation
            # chain again. However, we have to verify that the citation chain doesn't form
            # a cycle, as this would lead the scraper into an endless loop.
            updated_citation_chain = current.citation_chain + [existing_entry.id]
            if existing_entry.id not in current.citation_chain:
                write_citation_chain(existing_entry.id, updated_citation_chain, db_session)
            else:
                logging.warning("Potential cycle detected at " + str(updated_citation_chain) +
                                ". Skipping " + current.handle)
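# Invocation sketch (hypothetical wiring; the session/cache factories and the
# seed handle below are placeholders, not names taken from the project):
#
#   session = make_db_session()
#   cache = make_cache(settings.CACHE_LOCATION)
#   repec_scraper(session, cache,
#                 seed_handles=["RePEc:example:handle"],
#                 max_links=200)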