Code example #1
File: traverse.py  Project: getumen/twitter_graph
    def __init__(self):
        self.q = persistqueue.UniqueQ(path='q')
        self.rate = 1.0
        auth = tweepy.OAuthHandler(settings.CONSUMER_KEY,
                                   settings.CONSUMER_SECRET)
        auth.set_access_token(settings.ACCESS_TOKEN,
                              settings.ACCESS_TOKEN_SECRET)
        self.api = tweepy.API(auth)
        self.sleep_time = 60.0
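persistqueue.UniqueQ is the SQLite-backed queue variant that survives process restarts and silently ignores items that are already pending, so re-queuing the same value twice is a no-op. A minimal sketch of that behaviour, assuming persistqueue's documented de-duplication (the path and values here are placeholders):

import persistqueue

q = persistqueue.UniqueQ(path='q')
q.put('jack')
q.put('jack')   # duplicate of a pending item: ignored
q.put('biz')
print(q.size)   # 2
print(q.get())  # 'jack' (items come back in insertion order)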
Code example #2
    def populate_queue(self):
        """
        Reload the entries from the filenames database every 5 seconds and monitor file changes by
        comparing the modification time stored in the database with the file's current modification
        time. Trigger a re-upload when the two do not match.
        """

        entries = self.reload_files_to_watch()
        reprocess_queue = persistqueue.UniqueQ(self.queue_location)
        for e in entries:
            if not os.path.isfile(e):
                continue
            if entries[e]["mtime"] < os.stat(e).st_mtime:
                logging.info(
                    f"{e} changed. It will be queued for re-uploading")
                reprocess_queue.put(e)
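reload_files_to_watch() is not part of this excerpt. From the way its result is indexed above, it presumably returns a mapping from file path to the metadata stored in the filenames database, including the mtime recorded at upload time. A purely hypothetical sketch of that shape:

# Hypothetical return value of reload_files_to_watch(): one entry per watched
# path, with the modification time recorded when the file was last uploaded.
entries = {
    "/data/uploads/report.pdf": {"mtime": 1650000000.0},
    "/data/uploads/photo.jpg": {"mtime": 1650003600.0},
}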
Code example #3
    def process_queue(self):
        """ Pop queue and send it to the stash() click command for processeing """

        logging.info("Starting queue processor")

        reprocess_queue = persistqueue.UniqueQ(self.queue_location)

        # Snapshot the size first: get() shrinks the queue, so comparing a
        # running counter against the live size would stop partway through.
        for _ in range(reprocess_queue.size):
            logging.info(reprocess_queue.size)
            this_file = reprocess_queue.get()
            logging.info(
                f"{this_file} was received from the queue. It is now being sent for processing"
            )
            stash.callback(None, None, None, None, True, this_file)
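stash here appears to be a Click command; .callback on a Click command object is the original, undecorated function, so calling it directly bypasses CLI argument parsing. A small sketch of that pattern with a simplified, hypothetical signature (the project's real stash clearly takes six parameters, as the call above shows):

import click

@click.command()
@click.option("--force", is_flag=True)
@click.argument("filename")
def stash(force, filename):
    print(f"stashing {filename} (force={force})")

# Call the plain function behind the command, skipping Click's parsing:
stash.callback(True, "notes.txt")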
Code example #4
import persistqueue
import random

mqueue = persistqueue.UniqueQ("random", multithreading=True)


def getrandomstring(length=10):
    mstring = "abcdefghijklmnopqrstuvwxyz123456789"
    return "".join(random.choice(mstring) for _ in range(length))


def populatequeue(numentries=100):
    for i in range(numentries):
        mqueue.put(getrandomstring())


def getvalue():
    return mqueue.get()


def getvalues(numvalues):
    return [getvalue() for x in range(numvalues)]
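A quick usage run for the module above; output varies because the strings are random:

populatequeue(10)     # enqueue 10 random strings; duplicates, if any, are dropped
print(getvalue())     # one queued string
print(getvalues(3))   # a list of three more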
Code example #5
def getUsersToTreat():
    users = persistqueue.UniqueQ(
        '/warehouse/COMPLEXNET/ltarrade/ressources/users_persistQueue',
        auto_commit=True)
    return users
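A hedged sketch of how the returned queue might be drained, mirroring the empty()/get(timeout=0) pattern used in code example #6 below; process_user() is a hypothetical placeholder:

users = getUsersToTreat()
while not users.empty():
    user = users.get(timeout=0)  # timeout=0: fail fast instead of blocking
    process_user(user)           # hypothetical per-user processing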
Code example #6
def repec_scraper(db_session,
                  cache,
                  seed_handles,
                  max_links=100,
                  persist_at=settings.CACHE_LOCATION):
    if not persist_at.endswith("/"):
        persist_at = persist_at + "/"

    # Initialize Queue object to store RePEC handles; fill it with seed handles.
    repec_queue = persistqueue.UniqueQ(persist_at + "scraper_queue", auto_commit=True)

    # Step 1: Check if articles db is empty. If it is, then we need to make sure that the
    # scraping queue is empty too.
    citation_chain_count = db_session.query(db.CitationChain).count()
    citation_db_empty = citation_chain_count == 0

    # Clear the shelf and queue
    if citation_db_empty:
        logging.warning("Citations table is empty, so I'm clearing the repec_queue shelf")
        while not repec_queue.empty():
            repec_queue.get(timeout=0)

    # Initialize the link counter from queued entries plus stored citation chains
    link_count = len(repec_queue) + citation_chain_count

    # Add seed handles to the queue if they haven't been visited previously
    logging.info("Adding seed handles to queue...")
    for seed_handle in seed_handles:
        existing_entry = db_session.query(db.Article).filter_by(handle=seed_handle).first()
        # Because these articles are at the 'root' of the citation chain, the chain list is empty
        if existing_entry is None:
            repec_queue.put(ArticleInfo(seed_handle, []))
            link_count += 1

    # Spider through the queue
    while not repec_queue.empty():
        current = repec_queue.get(timeout=0)
        logging.info("Current queue length now: " + str(len(repec_queue)))
        existing_entry = db_session.query(db.Article).filter_by(handle=current.handle).scalar()
        if existing_entry is None:
            try:
                # Download RePEC data, store the article, and look up its newly assigned ID
                logging.info("Getting RePEC data for " + current.handle)
                article_info = get_repec_data(cache, current.handle)
                write_article(article_info, db_session)
                latest_article_id = db.latest_article_id(db_session)

                # Add current citation chain to db
                updated_citation_chain = current.citation_chain + [latest_article_id]
                write_citation_chain(latest_article_id, updated_citation_chain, db_session)

                # If we are below max_links, then get citec cites and add them to the queue
                if link_count < max_links:
                    logging.info("Getting cites for " + current.handle)
                    cites = get_citec_cites(cache, current.handle)
                    for handle in cites:
                        # The second argument takes the current citation chain and appends the
                        # latest article ID onto it: e.g., [1,2] -> [1,2,3].
                        to_put = ArticleInfo(handle, current.citation_chain + [latest_article_id])
                        repec_queue.put(to_put)
                        link_count += 1
                        logging.info("Current value of link_count : " + str(link_count))
                        if link_count > max_links:
                            break
                else:
                    logging.info("No room left in queue; skipping cites for " + current.handle)

            except AttributeError:
                logging.warning("No RePeC data for " + current.handle)

            except json.decoder.JSONDecodeError:
                logging.error("Problem decoding JSON for " + current.handle + ". Skipping this one.")

            except NoDataException:
                logging.warning("CitEc data missing for " + current.handle)

        else:
            # If the handle is already in the database, then we need to add the citation chain again.
            # However, we need to verify that the citation chain doesn't form a cycle, as this would lead
            # the scraper to follow an endless loop.
            updated_citation_chain = current.citation_chain + [existing_entry.id]
            if existing_entry.id not in current.citation_chain:
                write_citation_chain(existing_entry.id, updated_citation_chain, db_session)
            else:
                logging.warning("Potential cycle detected at" + str(updated_citation_chain) + " Skipping " + current.handle)