def scrapeFromQueue(q, config):
    """Take one task off the queue, scrape it, and report whether to stop.

    Args:
        q: Work queue. A ``None`` entry is the stop marker. The queue is also
            handed to the parser factory.
        config: Configuration passed to the parser factory and used to open
            the database connection when an error has to be recorded.

    Returns:
        True if the entry pulled from the queue was ``None`` (nothing is
        processed and ``task_done`` is not called for it), False otherwise.

    Exceptions derived from ``Exception`` raised while loading or parsing are
    not propagated: the stack trace is logged and stored, together with the
    pickled task, in ``crawler.error``. ``KeyboardInterrupt`` and
    ``SystemExit`` are allowed to propagate so the process can be stopped.
    """
    task = q.get()

    # if there are "None" tasks in the queue, we are done
    if task is None:
        return True

    try:
        # get HTML
        with LogTimer("Load and DOM page"):
            htmlDOM = loadPage(task.request)

        # if no hint, need to create one
        if task.hint is None:
            task.hint = parserfactory.getParserHint(task.request)

        # determine how to parse HTML
        parser = parserfactory.ParserFactory(q, config, task.hint)

        # parse HTML
        if parser is not None:
            with LogTimer("Parse page ({})".format(str(task))):
                parser.parse(htmlDOM, task.data)
    except Exception:
        stacktraceText = traceback.format_exc()
        logging.error(stacktraceText)
        with createConnFromConfig(config) as conn, conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO crawler.error (task, error_description) VALUES (%s, %s)',
                (pickle.dumps(task), stacktraceText)
            )  # TODO: Verify this works after converting to psycopg2. May need to escape_bytea
    finally:
        # Mark the task finished even if recording the error itself failed,
        # so anything waiting on q.join() is not left blocked.
        q.task_done()

    return False
Exemple #2
0
 def storeJudges(self, judges: List[JudgeInfo]):
     """Load the given judges into the ``o2cm.judge`` table.

     The rows are converted with ``convertDataToFileLike`` and passed to the
     cursor's ``copy_from`` in a single call.
     """
     judgeColumns = ('comp_id', 'event_id', 'round_num', 'judge_num', 'judge_name')
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             # NOTE(review): if the driver is psycopg2 >= 2.9, copy_from quotes
             # the table name, so a schema-qualified name may not resolve - confirm.
             judgeRows = convertDataToFileLike(judges)
             cursor.copy_from(judgeRows, 'o2cm.judge', columns=judgeColumns)
Exemple #3
0
 def storeResults(self, results: List[RoundResult]):
     """Load the given round results into the ``o2cm.round_result`` table.

     The rows are converted with ``convertDataToFileLike`` and passed to the
     cursor's ``copy_from``; ``null=''`` is passed through to that call.
     """
     resultColumns = ('comp_id', 'event_id', 'round_num', 'dance', 'couple_num', 'placement')
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             # NOTE(review): if the driver is psycopg2 >= 2.9, copy_from quotes
             # the table name, so a schema-qualified name may not resolve - confirm.
             resultRows = convertDataToFileLike(results)
             cursor.copy_from(
                 resultRows,
                 'o2cm.round_result',
                 columns=resultColumns,
                 null='')
Exemple #4
0
 def _resetData(self, compId):
     """Delete every stored row for one competition.

     Issues one DELETE per o2cm table, each filtered on ``comp_id``, inside
     a single connection/cursor and a DB-type ``LogTimer``.
     """
     # Executed in this exact order; presumably dependent tables are cleared
     # before the competition row itself - confirm against the schema.
     deleteStatements = (
         "DELETE FROM o2cm.judge WHERE comp_id = %s",
         "DELETE FROM o2cm.round_result WHERE comp_id = %s",
         "DELETE FROM o2cm.round_placement WHERE comp_id = %s",
         "DELETE FROM o2cm.entry WHERE comp_id = %s",
         "DELETE FROM o2cm.event WHERE comp_id = %s",
         "DELETE FROM o2cm.competition WHERE comp_id = %s",
     )
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             with LogTimer("Clean {}".format(compId), TimerType.DB):
                 for statement in deleteStatements:
                     cursor.execute(statement, (compId, ))
Exemple #5
0
 def storePlacements(self, placements: List[RoundPlacement]):
     """Load the given placements into the ``o2cm.round_placement`` table.

     The rows are converted with ``convertDataToFileLike`` and passed to the
     cursor's ``copy_from``; ``null=''`` is passed through to that call.
     """
     placementColumns = (
         'comp_id', 'event_id', 'round_num', 'dance', 'couple_num', 'judge_num', 'mark')
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             # NOTE(review): if the driver is psycopg2 >= 2.9, copy_from quotes
             # the table name, so a schema-qualified name may not resolve - confirm.
             placementRows = convertDataToFileLike(placements)
             cursor.copy_from(
                 placementRows,
                 'o2cm.round_placement',
                 columns=placementColumns,
                 null='')
Exemple #6
0
 def _storeData(self, compId, compName, compDate):
     """Insert one row (id, name, date) into ``o2cm.competition``.

     The insert runs inside a DB-type ``LogTimer`` labelled with the
     competition id.
     """
     insertSql = "INSERT INTO o2cm.competition (comp_id, comp_name, comp_date) VALUES (%s, %s, %s)"
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             with LogTimer("Save {}".format(compId), TimerType.DB):
                 cursor.execute(insertSql, (compId, compName, compDate))
    # Parse the command-line arguments; the options read below are
    # configFile, showExceptions and clearExceptions.
    args = parseArgs()

    # initialize config
    config = loadConfig(args.configFile[0])

    # Set up logging from the loaded config, then use the root logger.
    configureLogging(config)
    logger = logging.getLogger()

    logger.info("Initialized")

    # initialize queue
    # LIFO: the most recently added task is taken first.
    q = queue.LifoQueue()

    # --showExceptions: log every row of crawler.error (the unpickled task,
    # then its stored error description) and exit.
    if args.showExceptions:
        with createConnFromConfig(config) as conn, conn.cursor() as cursor:
            cursor.execute(
                "SELECT error_id, task, error_description FROM crawler.error")
            exceptions = cursor.fetchall()
            for e in exceptions:
                # e[1] is the task column, e[2] the error_description column.
                # NOTE(review): pickle.loads executes whatever the stored bytes
                # describe; this is only safe if crawler.error is trusted.
                # NOTE(review): unescape_bytea is presumably driver-specific;
                # verify it exists on the connection type returned by
                # createConnFromConfig.
                task = pickle.loads(conn.unescape_bytea(e[1]))
                logger.error(task)
                logger.error(e[2])
            exit()

    # --clearExceptions: delete every row of crawler.error and exit.
    if args.clearExceptions:
        with createConnFromConfig(config) as conn, conn.cursor() as cursor:
            cursor.execute("DELETE FROM crawler.error")
            # NOTE(review): exit() raises SystemExit while still inside the
            # with block; if the connection's context manager rolls back on
            # exceptions (psycopg2's does), this DELETE may never be
            # committed - confirm.
            exit()

    if args.exceptions:
 def _storeEventEntries(self, eventEntries: List[EventEntry]):
     """Load the given event entries into the ``o2cm.entry`` table.

     The rows are converted with ``convertDataToFileLike`` and passed to the
     cursor's ``copy_from`` in a single call.
     """
     entryColumns = (
         'comp_id',
         'event_id',
         'couple_num',
         'leader_name',
         'follower_name',
         'event_placement',
         'couple_location',
     )
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             # NOTE(review): if the driver is psycopg2 >= 2.9, copy_from quotes
             # the table name, so a schema-qualified name may not resolve - confirm.
             entryRows = convertDataToFileLike(eventEntries)
             cursor.copy_from(entryRows, 'o2cm.entry', columns=entryColumns)
 def _storeEvents(self, compEvents: List[EventData]):
     """Load the given events into the ``o2cm.event`` table.

     The rows are converted with ``convertDataToFileLike`` and passed to the
     cursor's ``copy_from`` in a single call.
     """
     eventColumns = ('comp_id', 'event_id', 'event_name', 'event_url', 'event_num')
     with createConnFromConfig(self.config) as conn:
         with conn.cursor() as cursor:
             # NOTE(review): if the driver is psycopg2 >= 2.9, copy_from quotes
             # the table name, so a schema-qualified name may not resolve - confirm.
             eventRows = convertDataToFileLike(compEvents)
             cursor.copy_from(eventRows, 'o2cm.event', columns=eventColumns)