def crawl_category(self):
    fetcher = Fetcher()
    response = yield fetcher.fetch(
        "http://www.carters.com/%s?startRow=0&sz=all" % self.slug)
    page = response.body
    self._process(page)
def loadGamelogs(self, year=None):
    """
    Loads gamelogs for the player for a given year

    Arguments:
    year : The season desired. Defaults to the current year if not specified
    """
    if year is None:
        year = datetime.datetime.now().year
    if year not in self.logs:
        self.logs[year] = []

    if 'primary_position' not in self:
        logger.error("no primary position attribute for %s" % self)
        return False

    url = Fetcher.MLB_PITCHER_URL if self['primary_position'] == 1 else Fetcher.MLB_BATTER_URL
    f = Fetcher(url, player_id=self.player_id, year=year)
    j = f.fetch()

    try:
        if self['primary_position'] == 1:
            parent = j['mlb_bio_pitching_last_10']['mlb_individual_pitching_game_log']['queryResults']
        else:
            if 'mlb_individual_hitting_last_x_total' in j:
                parent = j['mlb_individual_hitting_last_x_total']['mlb_individual_hitting_game_log']['queryResults']
            else:
                parent = j['mlb_bio_hitting_last_10']['mlb_individual_hitting_game_log']['queryResults']
    except KeyError, e:
        logger.error('no key for gamelogs found in %s' % f.url)
        return False
def __init__(self, year, month, day=None):
    """
    Constructor

    Arguments:
    year: The... year!
    month: The... month!
    day: The... day! (or None for all days of the month)
    """
    days = []
    if day is None:
        for d in xrange(1, calendar.mdays[month] + 1):
            days.append(datetime.date(year, month, d))
    else:
        days.append(datetime.date(year, month, day))

    begin = days[0]
    end = days[-1]

    f = Fetcher(Fetcher.MLB_TRANSACTION_URL,
                start=begin.strftime("%Y%m%d"),
                end=end.strftime("%Y%m%d"))
    try:
        obj = f.fetch()
        if obj['transaction_all']['queryResults']['totalSize'] == 0:
            return

        results = obj['transaction_all']['queryResults']['row']
        if type(results) is dict:
            self.append(results)
        else:
            for row in results:
                self.append(row)
    except (ValueError, KeyError), e:
        logger.error("ERROR %s on %s" % (e, f.url))
def run(self):
    """The starting point of a thread"""
    while True:
        self.dash.print_cur_stat("Next_Url___", self.thread_id)
        value, current_url, current_dns = self.frontier.get_url(self.thread_id)
        if not current_url:
            self.dash.print_cur_stat("Empty_Queue", self.thread_id)
            continue

        self.dash.print_cur_stat("Downloading", self.thread_id)
        code, links, content = Fetcher.fetch(current_url)
        if code == -1:
            self.dash.print_cur_stat("Refused_Url", self.thread_id)
            self.refused += 1
            self.dash.print_refused(str(self.refused), self.thread_id)
            continue

        # crawling this link succeeded
        self.dash.print_cur_stat("Valid_Url__", self.thread_id)
        out_links = len(links)
        sz_parent = len(content)
        links_mod = []
        for link in links:
            links_mod.append((link[0], link[1],
                              (out_links, sz_parent, len(link[0]), value)))

        self.dash.print_cur_stat("URL_Fetched", self.thread_id)
        self.crawled += 1
        self.dash.print_crawled(str(self.crawled), self.thread_id)
        self.frontier.push_to_serve(links_mod, self.thread_id)
        Storage.cache_crawled_url(current_url, current_dns, content, self.thread_id)
def _get_category_page(self):
    fetcher = Fetcher()
    ret = yield fetcher.fetch('http://www.6pm.com/%s' % self.slug)
    body = PQ(ret.body)
    last_href = body('.last a')[0].get('href')
    max_page = int(re.findall(r'-page(\d+)', last_href)[0])
    for i in range(max_page):
        self._crawl_category_page(i)
def _crawl_url(self, url):
    fetcher = Fetcher()
    ret = yield fetcher.fetch(url)
    body = PQ(ret.body)
    products = body('a.product')
    data = []
    for product in products:
        item = PQ(product)
        origin_price = re.findall(r'\$([\d\.]+)', item('.discount').text())
        if origin_price:
            origin_price = origin_price[0]
        sales_price = item('.price-6pm').text().replace('$', '').strip()
        if not origin_price and not sales_price:
            continue
        title = '[%s] %s' % (item('.brandName').text(), item('.productName').text())
        data.append({
            'image': item('.productImg').attr('src'),
            'link': parse_url('http://www.6pm.com' + item('a').attr('href')),
            'title': title,
            'original_price': origin_price or sales_price,
            'sales_price': sales_price
        })

    data = {
        'website': '6pm',
        'currency': 'USD',
        'country': 'USA',
        'store_id': self.store_id,
        'data': json.dumps(data)
    }
    data.update(self._extra_kwargs)
    q = yield fetcher.fetch(
        'http://127.0.0.1:8000/ezlookup/deal/?key=998998998',
        method="POST", data=data)
def loadYearlies(self):
    """
    Loads yearly and career totals for a player
    """
    if self['primary_position'] == 1 and not self.force_batting:
        f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
    else:
        f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

    j = f.fetch()

    # if the JSON object is empty, bail
    if len(j.keys()) == 0:
        return

    # get yearly totals
    if self['primary_position'] == 1 and not self.force_batting:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

    if parent['totalSize'] > 0:
        records = parent['row']

        # accounting for a player with only one row
        if type(records) is dict:
            records = [records]

        for row in records:
            log = {}
            for key, value in row.iteritems():
                log[key] = value

            # handle each season as a list, so players with multiple
            # team seasons get each team for that year accounted for
            if row['season'] in self.totals:
                self.totals[row['season']].append(log)
            else:
                self.totals[row['season']] = [log]

    # get career totals
    if self['primary_position'] == 1 and not self.force_batting:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

    if parent['totalSize'] > 0:
        for key, value in parent['row'].iteritems():
            self.career[key] = value
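# A hedged sketch of how the season totals built above might be consumed,
# assuming the Player construction seen in loadRoster() below; the module
# path, the player id, and the stat keys are all hypothetical.
from mlb import player  # hypothetical module path

p = player.Player('493316')  # hypothetical MLB.com player id
p.load()
p.loadYearlies()

# each season maps to a list of stat-line dicts, one per team stint
for season, stints in sorted(p.totals.items()):
    for line in stints:
        print season, line.get('team_full'), line.get('avg')  # hypothetical keys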
def load(self, loadRosters=False):
    """
    Calls MLB.com server and loads all team information

    Arguments:
    loadRosters : If true, rosters will automatically be loaded (more HTTP requests!)
    """
    f = Fetcher(Fetcher.MLB_LEAGUE_URL)

    for item in f.fetch():
        t = team.Team(item)
        if loadRosters:
            t.loadRoster()
        self.teams[t['team_code']] = t
def run(self):
    """The starting point of a thread"""
    print("Thread " + str(self.thread_id) + " started")
    while True:
        uid, current_url = self.frontier.get_url(self.thread_id)
        if not current_url:
            break
        print("thread " + str(self.thread_id) + " Got a URL")
        code, _, content = Fetcher.fetch(current_url)
        if code == -1:
            print("Unable to fetch link from thread " + str(self.thread_id))
            DBDeleteCrawled(0, 'cache_crawled', self.thread_id,
                            self.name, 0, uid).start()
            continue
        print("URL refreshed from thread " + str(self.thread_id))
        DBCacheCrawledRevisit(0, 'cache_crawled', self.thread_id,
                              self.name, 0, uid, content).start()
    print(self.name + " has finished revisiting")
def loadRoster(self):
    """
    Calls MLB.com servers to obtain the complete roster for the team.
    If the call fails, the '_error' property is set.
    """
    f = Fetcher(Fetcher.MLB_ROSTER_URL, team_id=self['team_id'])
    j = f.fetch()

    if 'roster_40' not in j:
        self._error = "ERROR on %s: key roster_40 not found (cannot load 40 man roster)" % f.url
        return False

    parent = j['roster_40']['queryResults']
    if parent['totalSize'] > 0:
        for record in parent['row']:
            player_id = record['player_id']
            self.roster[player_id] = player.Player(player_id)
class BaseTask(object):
    __metaclass__ = TaskMeta

    def __init__(self):
        self.fetcher = Fetcher()

    def on_start(self):
        raise NotImplementedError

    @gen.coroutine
    def fetch(self, url, next=None, args=(), data_type="html", **kwargs):
        '''
        :param url: URL to be fetched
        :param next: callback function
        :param args: arguments passed to the callback function after the data;
                     if no callback function is specified, args is ignored
        :param data_type: html/json/xml
        :return:
        '''
        ret = yield self.fetcher.fetch(url, **kwargs)
        if next:
            # use the callback, wrapping the result in a Response object
            next(Response(ret, data_type), *args)
        else:
            # use yield
            raise gen.Return(Response(ret, data_type))

    @gen.coroutine
    def save(self, data):
        '''Save to MongoDB; override this to change the save behavior.

        :param data:
        :return:
        '''
        if not isinstance(data, dict):
            raise TypeError('data must be an instance of dict')

        # add a timestamp
        data.update({'_timestamp': datetime.datetime.utcnow()})
        result = yield self._db.insert(data)
        self._logger.info("Saved: " + str(result))
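# A minimal sketch of a concrete task on top of BaseTask, assuming the
# TaskMeta metaclass registers subclasses and a scheduler invokes on_start();
# the URL and the saved fields are illustrative only.
from tornado import gen


class ExampleTask(BaseTask):
    """Hypothetical task: fetch one page and persist a summary record."""

    @gen.coroutine
    def on_start(self):
        # yield-style usage: fetch() resolves to a Response wrapper
        response = yield self.fetch('http://example.com/', data_type='html')
        # Response's parsing API is not shown above, so store the raw wrapper
        yield self.save({'url': 'http://example.com/', 'body': str(response)})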
def begin():
    try:
        f = Fetcher()
        fetch_result = f.fetch(kjxjr=False)

        p = Parser()
        parse_result = p.parse()
        print(parse_result)

        db = DB()
        db.insert(parse_result)

        r = Report()
        today = arrow.now()
        if today.format('dddd') == 'Friday':
            to_addr = [('*****@*****.**', '张磊'), ('*****@*****.**', '张永泉')]
        else:
            to_addr = [('*****@*****.**', '张磊')]
        r.send_report(to_addr=to_addr)
    except Exception as e:
        print(e)
def __init__(self, year, month, day=None):
    """
    Constructor

    Arguments:
    year: The... year!
    month: The... month!
    day: The... day! (or None for all days of the month)

    Schedule is a standard dictionary: each day is a key in the format
    'YYYY-MM-DD', and each value is a list of game dictionaries.
    """
    days = []
    if day is None:
        for d in xrange(1, calendar.mdays[month] + 1):
            days.append(datetime.date(year, month, d))
    else:
        days.append(datetime.date(year, month, day))

    for d in days:
        key = d.strftime("%Y-%m-%d")
        if key not in self.keys():
            self[key] = []

        f = Fetcher(Fetcher.MLB_SCHEDULE_URL, date=d.strftime("%Y%m%d"))
        try:
            content = f.fetch(True)
            if len(content) == 0:
                continue

            # the feed is not valid JSON: normalize tabs and quotes,
            # then quote the bare keys before parsing
            content = re.sub(r'\t+', '\t', content)
            content = content.replace('"', '\\"')
            content = content.replace("'", "\"")
            content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)

            obj = json.loads(content)
            self[key] = obj
        except ValueError, e:
            print "ERROR %s on %s" % (e, f.url)
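# To make the feed repair above concrete, a small standalone sketch with a
# made-up fragment in the quasi-JSON style the regexes target (the real MLB
# payload is more involved).
import json
import re

content = "{\thome: 'SF',\taway: 'LA'}"   # hypothetical input

content = re.sub(r'\t+', '\t', content)         # collapse runs of tabs
content = content.replace('"', '\\"')           # escape embedded double quotes
content = content.replace("'", "\"")            # single -> double quotes
content = re.sub(r'\t([\w,_]+):\s', r'"\1":', content)  # quote the bare keys

print json.loads(content)   # {u'home': u'SF', u'away': u'LA'} (key order may vary)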
def loadYearlies(self):
    """
    Loads yearly and career totals for a player
    """
    if self['primary_position'] == 1:
        f = Fetcher(Fetcher.MLB_PITCHER_SUMMARY_URL, player_id=self.player_id)
    else:
        f = Fetcher(Fetcher.MLB_BATTER_SUMMARY_URL, player_id=self.player_id)

    j = f.fetch()

    # if the JSON object is empty, bail
    if len(j.keys()) == 0:
        return

    # get yearly totals
    if self['primary_position'] == 1:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_season']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_season']['queryResults']

    if parent['totalSize'] > 0:
        records = parent['row']

        # accounting for a player with only one row
        if type(records) is dict:
            records = [records]

        for row in records:
            log = {}
            for key, value in row.iteritems():
                log[key] = value
            self.totals[row['season']] = log

    # get career totals
    if self['primary_position'] == 1:
        parent = j['mlb_bio_pitching_summary']['mlb_individual_pitching_career']['queryResults']
    else:
        parent = j['mlb_bio_hitting_summary']['mlb_individual_hitting_career']['queryResults']

    if parent['totalSize'] > 0:
        for key, value in parent['row'].iteritems():
            self.career[key] = value
def load(self, load_yearlies=False, id=None):
    """
    Calls MLB.com server and loads player information.
    If the call fails, the '_error' property is set.

    Arguments:
    id : The MLB.com player ID
    """
    if id is None:
        if self.player_id is None:
            raise Exception('No player_id specified')
        id = self.player_id
    else:
        self.player_id = id
    self['player_id'] = self.player_id

    f = Fetcher(Fetcher.MLB_PLAYER_URL, player_id=self.player_id)
    j = f.fetch()

    try:
        records = j['player_info']['queryResults']['totalSize']
    except KeyError, e:
        msg = 'ERROR on %s: totalSize not returned for call' % f.url
        self._error = msg
        logger.error(msg)
        return False
from fetcher import Fetcher
import sys

num = int(sys.argv[1])
code = Fetcher.fetch(num)

print "----------------------------------------------------"
print
print
print code
print "----------------------------------------------------"
class Crawler:
    """
    Scans through websites via HTTP GET requests and recursion.
    """

    MAX_DEPTH = 10

    def __init__(self):
        self.fetcher = Fetcher()
        self.tree = Tree()
        self.documents = []
        self.invalid_documents = []

    def get_document_by_guid(self, guid):
        for document in self.documents:
            if document.guid == guid:
                return document

    def crawl(self, url, current_depth=0):
        if self.fetcher.url_valid(url):
            try:
                html = self.fetcher.fetch(url)
                parser = Parser(url, html)
                parser.parse()
                self.documents.append(parser.document)
                for token in parser.document.tokens:
                    self.tree.add(token.text, token)

                print(colored('Document crawled successfully.', 'green'))
                print('URL: %s' % parser.document.url)
                print('Tokens: %s' % len(parser.document.tokens))
                print('Links: %s' % len(parser.document.links))
                print('\n')

                # the depth increases with each subsequent crawl call
                if current_depth <= Crawler.MAX_DEPTH:
                    for link in parser.document.links:
                        document = Document(link)
                        if document not in self.documents and document not in self.invalid_documents:
                            # NOTE: this does not fully guarantee that the same document
                            # will not be crawled again, as there are many ways to
                            # forward DNS to the same page...
                            self.crawl(link, current_depth + 1)
                            # if crawling has been successful, append the link as crawled
                            parser.document.crawled_links.append(link)
            except (MissingSchema, InvalidSchema):
                self.handle_invalid_url(url, 'Invalid URL specified.')
            except NotFoundException:
                self.handle_invalid_url(url, 'Document not found.')
            except ConnectionError:
                self.handle_invalid_url(url, 'Error while crawling document.')
            except (SyntaxException, RecursionError):
                self.handle_invalid_url(url, 'Error while parsing document.')
            except TimeoutException:
                self.handle_invalid_url(url, 'Document took too long to process.')
        else:
            self.handle_invalid_url(url, 'Invalid URL specified.')

    def handle_invalid_url(self, url, message):
        document = Document(url)
        document.valid = False
        if document not in self.invalid_documents:
            self.invalid_documents.append(document)
        print(colored(message, 'red'))
        print('URL: %s' % url)
        print('\n')
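# A brief usage sketch, assuming the crawler is started from a single seed
# URL; the seed itself is arbitrary.
if __name__ == '__main__':
    crawler = Crawler()
    crawler.crawl('http://example.com/')  # recurses up to MAX_DEPTH
    print('%d documents crawled, %d invalid' %
          (len(crawler.documents), len(crawler.invalid_documents)))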
class Coffer:
    def __init__(self, config_parser):
        # Connect to engine
        database_path = get_from_config_parser(config_parser, 'Database', 'path', 'database')
        database_debug = get_boolean_from_config_parser(config_parser, 'Database', 'debug', False)
        db_dir = os.path.dirname(database_path)
        if not os.path.exists(db_dir):
            mkdir(db_dir)
        sys.stderr.write('Connecting to database at "%s"\n' % database_path)
        self._engine = create_engine('sqlite:///%s' % database_path, echo=database_debug)

        # Start session
        Session = sessionmaker(bind=self._engine)
        self._session = Session()

        # Initialize feed and item storage
        self._feed_storage = FeedStorage(self._engine, self._session)
        self._item_storage = ItemStorage(self._engine, self._session)

        # A list of subprocess.Popen processes that will be maintained
        # by the Coffer object.
        self._external_processes = []

        # File storage (data dump)
        file_storage_path = get_from_config_parser(config_parser, 'FileStorage', 'path', 'datadump')
        max_block_size = get_int_from_config_parser(config_parser, 'FileStorage', 'max-block-size',
                                                    file_storage.DEFAULT_MAX_BLOCK_SIZE)
        bzip2_path = get_from_config_parser(config_parser, 'FileStorage', 'bzip2-path', '/usr/bin/bzip2')
        self._file_storage = FileStorage(self._external_processes, file_storage_path,
                                         max_block_size, bzip2_path)

        # Content fetcher configuration
        self._fetcher = Fetcher(config_parser)

    def clone_db_session(self):
        clone_session = sessionmaker(bind=self._engine)
        return clone_session()

    def finish(self):
        '''
        Waits for all external processes started by Coffer to finish.
        '''
        sys.stderr.write('Waiting for sub-processes to finish..\n')
        for process in self._external_processes:
            process.wait()
        sys.stderr.write('  ..finished.\n\n')

    def check_processes(self):
        '''
        Checks whether some of the external processes have finished and
        removes them from the external-process list if they have.
        '''
        end_i = len(self._external_processes)
        i = 0
        while i < end_i:
            if self._external_processes[i].poll() is not None:
                del self._external_processes[i]
                end_i -= 1
            else:
                i += 1

    def run_command_shell(self):
        shell = CommandShell(self)
        shell.cmdloop()

    def get_feed_info(self, url):
        '''
        Obtain information on an RSS feed, given its URL. The information is
        obtained directly from the URL, not from our database, so this works
        for feeds regardless of whether they are stored in our database.
        '''
        feed_results = feedparser.parse(url)
        sys.stderr.write(str(feed_results))
        if 'title' in feed_results.feed:
            return feed_results.feed.title
        else:
            return None

    def current_items_feed(self, session, feed, enable_ad_filter=False,
                           check_existence=False, debug_enabled=False):
        '''
        Returns a generator for the current list of fresh items returned by
        the given feed.
        @param enable_ad_filter: if True, advertisements will be filtered out
            using the predefined regex
        @param check_existence: if True, only entries that are not already
            stored in the items database will be returned.
        '''
        if enable_ad_filter and len(feed.ad_filters) > 0:
            exclude_pattern = re.compile(u'|'.join(feed.ad_filters))
        feed_results = feedparser.parse(feed.get_url())
        for entry in feed_results.entries:
            if 'link' not in entry.keys():
                sys.stderr.write((u'No link found in this item: "%s"\n'
                                  % entry.title).encode('utf-8'))
                if debug_enabled:
                    sys.stderr.write('Keys:\n%s\n' % str(entry.keys()))
                continue
            if 'id' not in entry.keys():
                # fall back to the link as the entry id
                entry_id = entry.link
                if debug_enabled:
                    sys.stderr.write((u'No entry id found in this item: "%s"\n'
                                      % entry.title).encode('utf-8'))
                    sys.stderr.write('Keys:\n%s\n' % str(entry.keys()))
                    sys.stderr.write((u'Using link [%s] instead of id.\n'
                                      % entry_id).encode('utf-8'))
            else:
                entry_id = entry.id
            if check_existence:
                if self._item_storage.exists_in_session(session, entry_id):
                    continue
            if (not enable_ad_filter) or (len(feed.ad_filters) == 0) \
                    or (not exclude_pattern.search(entry.title)):
                yield (feed.get_id(), entry_id, entry)

    def current_items_in_session(self, session, enable_ad_filter=False,
                                 check_existence=False, debug_enabled=False):
        '''
        Returns a generator for the current list of fresh items returned by
        all known feeds, using the given session. Parameters are as in
        current_items_feed.
        '''
        for feed in self._feed_storage.feeds():
            for item in self.current_items_feed(session, feed, enable_ad_filter,
                                                check_existence, debug_enabled):
                yield item

    def current_items(self, enable_ad_filter=False, check_existence=False,
                      debug_enabled=False):
        '''
        Same as current_items_in_session, but uses the Coffer's own session.
        '''
        for feed in self._feed_storage.feeds():
            for item in self.current_items_feed(self._session, feed, enable_ad_filter,
                                                check_existence, debug_enabled):
                yield item

    def fetch_and_store(self, targets):
        '''
        Download target URLs and store them in the file storage.
        @param targets: A list of (feed-id, URL) pairs.
        '''
        text_objs_dict = self._fetcher.fetch(targets)
        self._file_storage.store_all(text_objs_dict)
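# A hedged sketch of driving Coffer end to end, assuming a ConfigParser
# instance carrying the Database and FileStorage sections the constructor
# reads; the config filename is hypothetical.
from ConfigParser import ConfigParser  # Python 2, matching the class above

config = ConfigParser()
config.read('coffer.cfg')  # hypothetical path
coffer = Coffer(config)

# download every fresh, not-yet-stored item's link into the file storage
targets = [(feed_id, entry.link)
           for feed_id, entry_id, entry in coffer.current_items(check_existence=True)]
coffer.fetch_and_store(targets)
coffer.finish()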
from fetcher import Fetcher

num = 1
while True:
    x = raw_input(">> ")
    if x == "q":
        submission = Fetcher.fetch(num)
        if submission:
            num += 1
        else:
            print "No new submission"
def _save(self, data):
    fetcher = Fetcher()
    q = yield fetcher.fetch(
        'http://127.0.0.1:8000/ezlookup/deal/?key=998998998',
        method="POST", data=data)