def scrapeFromQueue(q, config):
    """Pull one task from the queue, scrape it, and record any failure.

    Loads the task's page, resolves a parser via the parser factory, and
    parses the DOM. Any failure during the task is logged and persisted to
    crawler.error so the run can continue with the next task.

    :param q: task queue; a ``None`` entry is the shutdown sentinel
    :param config: crawler configuration (also used to open a DB connection)
    :return: True when the shutdown sentinel was seen, otherwise False
    """
    task = q.get()
    try:
        # A "None" task in the queue is the shutdown sentinel: we are done.
        # NOTE(review): this early return skips q.task_done() for the
        # sentinel entry — confirm nothing relies on q.join() completing.
        if task is None:
            return True

        # Fetch the page and build its DOM.
        with LogTimer("Load and DOM page"):
            htmlDOM = loadPage(task.request)

        # If the task carries no parser hint, derive one from the request.
        if task.hint is None:
            task.hint = parserfactory.getParserHint(task.request)

        # Determine how to parse the HTML.
        parser = parserfactory.ParserFactory(q, config, task.hint)

        # Parse the HTML (the parser may enqueue follow-up tasks on q).
        if parser is not None:
            with LogTimer("Parse page ({})".format(str(task))):
                parser.parse(htmlDOM, task.data)
    except Exception:
        # Fix: this was a bare "except:", which also caught SystemExit and
        # KeyboardInterrupt and recorded them as crawl errors. Narrowed so
        # interpreter-exit signals propagate normally; real task failures
        # are still logged and persisted for later inspection.
        stacktraceText = traceback.format_exc()
        logging.error(stacktraceText)
        with createConnFromConfig(config) as conn, conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO crawler.error (task, error_description) VALUES (%s, %s)',
                (pickle.dumps(task), stacktraceText)
            )
            # TODO: Verify this works after converting to psycopg2. May need to escape_bytea
    q.task_done()
    return False
def storeJudges(self, judges: List[JudgeInfo]):
    """Bulk-load judge rows into o2cm.judge via cursor.copy_from."""
    judge_columns = ('comp_id', 'event_id', 'round_num',
                     'judge_num', 'judge_name')
    with createConnFromConfig(self.config) as conn:
        with conn.cursor() as cursor:
            cursor.copy_from(convertDataToFileLike(judges),
                             'o2cm.judge',
                             columns=judge_columns)
def storeResults(self, results: List[RoundResult]):
    """Bulk-load per-dance round results into o2cm.round_result via cursor.copy_from.

    Empty fields in the file-like payload are stored as NULL (null='').
    """
    result_columns = ('comp_id', 'event_id', 'round_num',
                      'dance', 'couple_num', 'placement')
    with createConnFromConfig(self.config) as conn:
        with conn.cursor() as cursor:
            cursor.copy_from(convertDataToFileLike(results),
                             'o2cm.round_result',
                             columns=result_columns,
                             null='')
def _resetData(self, compId):
    """Remove every row belonging to compId from the o2cm tables.

    Tables are cleared in the original order (dependent tables before
    o2cm.competition), one DELETE per table, inside a single connection.
    """
    tables = ('judge', 'round_result', 'round_placement',
              'entry', 'event', 'competition')
    with createConnFromConfig(self.config) as conn, \
            conn.cursor() as cursor, \
            LogTimer("Clean {}".format(compId), TimerType.DB):
        for table in tables:
            cursor.execute(
                "DELETE FROM o2cm.{} WHERE comp_id = %s".format(table),
                (compId, ))
def storePlacements(self, placements: List[RoundPlacement]):
    """Bulk-load individual judge marks into o2cm.round_placement via cursor.copy_from.

    Empty fields in the file-like payload are stored as NULL (null='').
    """
    placement_columns = ('comp_id', 'event_id', 'round_num', 'dance',
                         'couple_num', 'judge_num', 'mark')
    with createConnFromConfig(self.config) as conn:
        with conn.cursor() as cursor:
            cursor.copy_from(convertDataToFileLike(placements),
                             'o2cm.round_placement',
                             columns=placement_columns,
                             null='')
def _storeData(self, compId, compName, compDate):
    """Insert the competition header row into o2cm.competition."""
    insert_sql = ("INSERT INTO o2cm.competition "
                  "(comp_id, comp_name, comp_date) VALUES (%s, %s, %s)")
    with createConnFromConfig(self.config) as conn, \
            conn.cursor() as cursor, \
            LogTimer("Save {}".format(compId), TimerType.DB):
        cursor.execute(insert_sql, (compId, compName, compDate))
args = parseArgs() # initialize config config = loadConfig(args.configFile[0]) configureLogging(config) logger = logging.getLogger() logger.info("Initialized") # initialize queue q = queue.LifoQueue() if args.showExceptions: with createConnFromConfig(config) as conn, conn.cursor() as cursor: cursor.execute( "SELECT error_id, task, error_description FROM crawler.error") exceptions = cursor.fetchall() for e in exceptions: task = pickle.loads(conn.unescape_bytea(e[1])) logger.error(task) logger.error(e[2]) exit() if args.clearExceptions: with createConnFromConfig(config) as conn, conn.cursor() as cursor: cursor.execute("DELETE FROM crawler.error") exit() if args.exceptions:
def _storeEventEntries(self, eventEntries: List[EventEntry]):
    """Bulk-load couple entries into o2cm.entry via cursor.copy_from."""
    entry_columns = ('comp_id', 'event_id', 'couple_num', 'leader_name',
                     'follower_name', 'event_placement', 'couple_location')
    with createConnFromConfig(self.config) as conn:
        with conn.cursor() as cursor:
            cursor.copy_from(convertDataToFileLike(eventEntries),
                             'o2cm.entry',
                             columns=entry_columns)
def _storeEvents(self, compEvents: List[EventData]):
    """Bulk-load event metadata rows into o2cm.event via cursor.copy_from."""
    event_columns = ('comp_id', 'event_id', 'event_name',
                     'event_url', 'event_num')
    with createConnFromConfig(self.config) as conn:
        with conn.cursor() as cursor:
            cursor.copy_from(convertDataToFileLike(compEvents),
                             'o2cm.event',
                             columns=event_columns)