def get_all_author(self):
    cur_url = "http://tuchong.com/contacts/rank/"
    while True:
        _logger.debug("opening initial ranking page at %s" % cur_url)
        rank_page = self.br.open(cur_url, timeout=TIMEOUT).read()
        soup = BeautifulSoup(rank_page)
        cur_list = soup.findAll("a", {"data-site-id": True})
        for author in cur_list:
            self.author_to_crawl.append(author['href'])
            _logger.debug('got author %s' % author['href'])
        next_page_anchor = soup.findAll('a', 'next')
        if len(next_page_anchor) > 1:
            _logger.fatal(
                'multiple next page anchors found, url:(%s), anchor count:%d'
                % (cur_url, len(next_page_anchor)))
        if len(next_page_anchor) == 0:
            break
        cur_url = next_page_anchor[0]['href']
    return self.author_to_crawl
def run(self):
    while True:
        self.heartbeat(pending_input=True)
        self.agent.restart()
        tasks = self.agent.get_all_crawler_task()
        my_task = None
        for task in tasks:
            if task['id'] % self.shard_count == self.shard_id:
                my_task = task
                break
        if not my_task:
            _logger.debug('no task for process shard %d' % self.shard_id)
            time.sleep(10)
            continue
        self.heartbeat(pending_input=False)
        _logger.debug("Got task:%s" % my_task)
        try:
            if my_task['ttl'] > 1:
                self.process_hub(my_task)
            elif my_task['ttl'] == 1:
                self.process_terminal(my_task)
        except Exception, err:
            _logger.error(
                'unexpected exception with url(%s):%s, %s'
                % (my_task['anchor_url'], err, traceback.format_exc()))
        finally:
            pass
def fit(self, X, y):
    self.reset()
    size = len(y)
    for i in xrange(size):
        if (i + 1) % 10000 == 0:
            _logger.debug("%d processed" % (i + 1))
        terms = X[i]
        domain = y[i]
        self.training_sentence_count += 1
        terms = terms.split(' ')
        self.domain_count[domain] += 1
        term_set = set()
        for term in terms:
            term = self.get_category(term)
            if term in term_set:
                continue
            term_set.add(term)
            self.terms.add(term)
            self.count[term, domain] += 1
            self.count[domain] += 1
            self.term_count[term] += 1
            self.domain_has[domain].add(term)
    for domain in self.domain_has:
        backoff = len(self.domain_has[domain]) * self.alpha / self.count[domain]
        backoff /= len(self.term_count) - len(self.domain_has[domain])
        self.domain_backoff[domain] = backoff
    self.domains = self.domain_backoff.keys()
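# The counts collected by fit() above imply an absolute-discounting scheme:
# alpha is held out from each seen (term, domain) count and the reserved mass
# (domain_backoff) is spread over unseen terms. The scorer below is only a
# hedged sketch of how those statistics could be used at prediction time; the
# actual decoder is not shown here, and `log_score` is a hypothetical name.
# It assumes 0 < alpha < 1 so the discounted probability stays positive.
import math

def log_score(self, sentence, domain):
    # log prior from the domain's sentence frequency
    score = math.log(float(self.domain_count[domain]) / self.training_sentence_count)
    # each distinct (category-mapped) term adds its discounted log likelihood
    for term in set(self.get_category(t) for t in sentence.split(' ')):
        if term in self.domain_has[domain]:
            prob = (self.count[term, domain] - self.alpha) / float(self.count[domain])
        else:
            prob = self.domain_backoff[domain]
        score += math.log(prob)
    return score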
def test(model, test_file_path):
    total = 0
    correct = 0
    decoder = NaiveDecoder(model)
    outfile = open("predicted.dat", 'w')
    _logger.info("Testing %s" % test_file_path)
    with open(test_file_path) as test_file:
        processed = 1
        for line in test_file:
            line = line.strip().decode('utf-8')
            if not line:
                continue
            total += 1
            sentence, tag = line.split('\t')
            #sentence = extract(sentence)
            result = decoder.decode(sentence)
            predicted, _ = conv.argmax(result.items())
            outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'),
                                            predicted.encode('utf-8'),
                                            tag.encode('utf-8')))
            if predicted == tag:
                correct += 1
            if processed % 1000 == 0:
                _logger.debug("%d lines processed" % processed)
            processed += 1
    outfile.close()
    _logger.info("accuracy: %f" % (float(correct) / total))
def _sleep(self, name):
    from time import sleep
    if name in self._wp_config['sleep']:
        _logger.debug('sleep %d seconds for %s'
                      % (self._wp_config['sleep'][name], name))
        sleep(self._wp_config['sleep'][name])
    else:
        _logger.debug('sleep 0.5 seconds for ' + name)
        sleep(0.5)
def collect_last_term(self, X):
    X_last = list()
    tokens = self.last_vec.build_tokenizer()
    _logger.debug("Extracting last term for each sentence")
    for sent in X:
        X_last.append(tokens(sent)[-1])
    _logger.debug("Last terms collected")
    return X_last
def grab_image_info_group(self, soup):
    all_info = []
    all_img = soup.findAll('figure', 'post-photo')
    _logger.debug('%d images in group' % len(all_img))
    for img in all_img:
        url = img.find('a')['href']
        all_info.append(self.grab_image_info(url)[0])
        _logger.debug('sleeping for 5 sec')
        time.sleep(5)
    return all_info
def main(): _logger.info("wee indexer started") agent = WeeSQLAgent(DB_NAME, DB_USER, DB_PASSWORD) agent.start() _logger.info("MySQL agent started") indexer = Indexer(agent) while True: #agent.restart() indexer.index_new_wee() _logger.debug("Sleep for %d sec" % SLEEP_SEC) time.sleep(SLEEP_SEC)
def bad_proxy(proxy):
    proxy_log = agent.get_proxy_log(proxy)
    if proxy_log == None or proxy_log['use_count'] < PROXY_TRYOUT_COUNT \
            or float(proxy_log['fail_count']) / float(proxy_log['use_count']) < VALID_PROXY_FAIL_RATE:
        return False
    else:
        _logger.debug(
            "bad proxy: addr=%s, use=%d, fail=%d, fail_rate=%.2f%%"
            % (proxy['addr'], proxy_log['use_count'], proxy_log['fail_count'],
               float(proxy_log['fail_count']) / float(proxy_log['use_count']) * 100))
        return True
def crawl_authors(self, authors, callback):
    for author in authors:
        cur_url = author
        _logger.info("crawling author from %s" % cur_url)
        try:
            self.crawl_one_author(cur_url, callback)
            _logger.debug('sleeping for 5 sec')
            time.sleep(5)
        except Exception, err:
            _logger.error(
                "crawl one author failed, url:(%s), error:%s, %s"
                % (cur_url, err, traceback.format_exc()))
            continue
def pick_proxy_for_slot(agent, slot_id, all_proxy):
    proxies = [proxy for proxy in all_proxy if proxy['slot_id'] == None]
    if len(proxies) == 0:
        _logger.error("No free proxy for slot %d" % slot_id)
        return
    for proxy in proxies:
        if not bad_proxy(proxy):
            _logger.debug("got healthy proxy at %s" % proxy['addr'])
            agent.update_proxy_slot(slot_id, proxy)
            proxy['slot_id'] = slot_id
            return
    _logger.error("Can't find any decent proxy for slot %d" % slot_id)
def main(): _logger.info("checking dict from %s" % DICT_FILE_PATH) agent = WeeSQLAgent(DB_NAME, DB_USER, DB_PASSWORD) agent.start() unindexed_terms = [] dict_file = open(DICT_FILE_PATH, 'a+') # load all data exists = [ term.split(' ')[1] for term in [line for line in dict_file.read().split('\n')] if term != '' ] _logger.info("%d term exists in old dict" % len(exists)) terms = agent.get_all_custom_tags() _logger.info("checking %d custom tags" % len(terms)) for term in terms: text = term['tag'] _logger.debug('checking %s' % text) if text.find( ' ' ) == -1 and text not in exists: # ignore if text contains space _logger.info("adding %s to dict" % text) dict_file.write("%d %s\n" % (len(text.decode('utf-8')), text)) unindexed_terms.append(text) dict_file.flush() os.fsync(dict_file.fileno()) dict_file.close() _logger.info("dict updated") if len(unindexed_terms) > 0: _logger.info("unindexed terms:(%s)" % ",".join(unindexed_terms)) # must import here rather than in the beginning of file # because dict file will be read only when Indexer is imported and # we've just updated the dict # from indexer import Indexer # _logger.info("need to update index for %d terms" % len(unindexed_terms)) # time.sleep(5) # indexer = Indexer(agent) # indexer.update_index_for_terms(unindexed_terms) else: _logger.info("no new tags found") agent.stop()
def crawl_second(self, url):
    self._randsleep()
    _logger.debug('opening url:%s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
    for anchor in soup.findAll('a'):
        try:
            href = anchor['href']
            # Ignore internal links
            if href[:4] != "http" or href.find('hao123.com') != -1:
                continue
            self.output.write(' %s %s\n' % (href.encode('utf8'),
                                            anchor.text.encode('utf8')))
        except Exception, err:
            _logger.error('got error with anchor(%s): %s' % (str(anchor), err))
def fetch_source(self, source):
    _logger.debug("pool stat: %d working %d waiting"
                  % (self.pool.running(), self.pool.waiting()))
    _logger.info("crawling source id=%d url=%s" % (source['id'], source['url']))
    cur_time = int(time.time())
    last_crawl_time = source['last_crawl_time']
    if cur_time - last_crawl_time < HOUR:
        _logger.info("ignore source(%s), last crawled %d minutes ago"
                     % (source['url'], (cur_time - last_crawl_time) / 60))
        return
    try:
        _logger.debug("fetching feed from (%s)" % source['url'])
        p = feedparser.parse(source['url'])
        _logger.debug("fetched from (%s)" % source['url'])
        if p.feed.has_key('updated_parsed') and p.feed.updated_parsed != None:
            cur_feed_time = int(time.mktime(p.feed.updated_parsed))
        else:
            # FeedParser doesn't understand the 'updated' field of this feed,
            # neither can we. Probably some CJK chars.
            cur_feed_time = int(time.time())
        db_feed_time = source['last_feed_time']
        if db_feed_time >= cur_feed_time:
            _logger.info(
                "ignore source(%s), no new feed. Last feed:%s, cur feed:%s"
                % (source['url'], datetime.fromtimestamp(db_feed_time),
                   datetime.fromtimestamp(cur_feed_time)))
            self.agent.update_source_time(source)
        else:
            _logger.info("processing %d entries from %s"
                         % (len(p.entries), source['url']))
            for entry in p.entries:
                self.process_entry(entry, source)
            self.agent.update_source_time(source, cur_feed_time)
            _logger.debug("source(%s) updated: %s"
                          % (source['url'], datetime.fromtimestamp(cur_feed_time)))
        _logger.info("source(id=%d) success" % source['id'])
        _logger.debug("pool stat: %d working %d waiting"
                      % (self.pool.running(), self.pool.waiting()))
    except Exception, err:
        _logger.error("crawling failed for source id=%d, %s: %s"
                      % (source['id'], source['url'], traceback.format_exc()))
class FeedCrawler(object):
    def __init__(self, agent, pool):
        self.agent = agent
        self.pool = pool

    def grab_image(self, html, entry):
        wee_url = entry.link.encode('utf-8')
        soup = BeautifulSoup(html, fromEncoding="utf-8")
        img = soup.find('img', src=True)
        if img == None:
            _logger.debug("%s has no image inside" % wee_url)
            return
        url = img['src']
        _logger.debug('downloading image from %s' % url)
        try:
            br = pbrowser.get_browser()
            image = br.download_image(url, base_url=wee_url).read()
        except Exception, err:
            _logger.error("downloading image failed(%s), baseurl(%s): %s"
                          % (url, wee_url, traceback.format_exc()))
            return
        try:
            self.agent.add_wee_image(wee_url, image)
            _logger.debug("image added for wee:%s" % wee_url)
        except Exception, err:
            _logger.error("db error, failed to add image for wee %s: %s"
                          % (wee_url, err))
class BrowserSinaWeibo:
    def __init__(self, user=None):
        self.user = user
        self.selenium = selenium('localhost', 4444, 'chrome', 'http://www.baidu.com')
        _logger.info('starting selenium')
        self.selenium.start()
        self.selenium.set_timeout(120 * 1000)  # timeout 120 seconds

    def _wait_load(self, minutes=1):
        MIN = 60 * 1000
        try:
            self.selenium.wait_for_page_to_load(timeout=MIN * minutes)
        except Exception, err:
            _logger.error('error waiting for page to load(%d min), will continue:%s'
                          % (minutes, err))

    def login_sina_weibo(self):
        _logger.debug('logging in to t.cn')
        TEN_MIN = 10 * 60 * 1000
        try:
            _logger.debug('try logging out, just in case')
            self.selenium.click(u'link=退出')  # the "log out" link
            self._wait_load()
        except Exception, err:
            _logger.debug('clicking logout link failed')
        # Open Sina Weibo
        _logger.debug('opening login page of http://t.sina.com.cn')
        self.selenium.open('http://t.sina.com.cn')
        self._wait_load()
        self.selenium.window_maximize()
        _logger.debug('filling login form')
        try:
            self.selenium.type('id=loginname', self.user.uname)
            self.selenium.type('id=password', self.user.passwd)
            self.selenium.type('id=password_text', self.user.passwd)
            self.selenium.uncheck('id=remusrname')
            self.selenium.click('id=login_submit_btn')
        except Exception, err:
            dumppath = util.dump2file_with_date(self.selenium.get_html_source())
            raise Exception('filling t.cn login form failed: %s, page dumped to %s'
                            % (err, dumppath))
def grab_image_info(self, img_url):
    info = {
        "title": "",
        "description": "",
        "binary": "",
        "author": "",
        "tags": "",
        "ext": "",
        "popularity": ""
    }
    _logger.debug("opening image URL: %s" % img_url)
    try:
        self.br.open(img_url, timeout=TIMEOUT)
    except Exception, err:
        _logger.error('failed to open url: %s' % img_url)
        return info
def spam_one_blog(self, anchor, href, target_url):
    prefix = '/interstitial?url='
    pos = target_url.find(prefix)
    if pos != -1:
        stripped = target_url[pos + len(prefix):]
        _logger.debug('stripped %s to %s' % (target_url, stripped))
        target_url = stripped
    error = ''
    retry = 0
    # Open blog post page
    browser = pbrowser.get_browser()
    while retry < 5:
        try:
            res = browser.open(target_url, timeout=10)
            html = res.read()
            break
        except Exception, err:
            error += 'open blog url failed (%d / 5):%s\n' % (retry + 1, err)
            retry += 1
def process_content(self, entry):
    _logger.debug("pool stat: %d working %d waiting"
                  % (self.pool.running(), self.pool.waiting()))
    text = ''
    html = ''
    if entry.has_key('summary_detail'):
        content = entry.summary_detail
        if content.type == u"text/plain" and text == '':
            text = content.value.encode('utf-8')
        elif content.type == u"text/html" and html == '':
            html = content.value.encode('utf-8')
    elif entry.has_key('summary'):
        html = entry.summary.encode('utf-8')
    if html == '' and text == '':
        _logger.error("failed to get text for entry %s" % entry.link.encode('utf-8'))
    return text, html
def train(self):
    self.count = defaultdict(int)
    c = 0
    with open(self.train_path) as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            terms, domain = line.split('\t')
            term_set = set()
            for term in terms.split(' '):
                term = term_category(term)
                if term not in term_set:
                    term_set.add(term)
                    self.count[(term, domain)] += 1
            c += 1
            if c % 10000 == 0:
                _logger.debug("%d records processed" % c)
def _crawl_fourth(self, url):
    page = 1
    while True:
        _logger.debug('fourth layer page %d (%s)' % (page, url))
        page += 1
        self._randsleep()
        html = self.br.open(url).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)
        for td in soup.findAll('td', 'f'):
            self.output.write(' %s\n' % td.find('a').text.encode('utf-8'))
        self.output.flush()
        try:
            # follow the "next page" (下一页) link
            url = soup.find('font', 'f9').find(text=u"下一页").parent['href']
        except:
            break
def add_wee(self, source_id, url, title, text, html, updated_time,
            author='', tags=[]):
    self.cursor.execute(
        'insert into wee(source_id, url, title, text, updated_time, author, html) '
        'values(%s, %s, %s, %s, %s, %s, %s)',
        (source_id, url, title, text, updated_time, author, html))
    for tag in tags:
        try:
            self.cursor.execute('insert into wee_tag values(%s, %s)', (url, tag))
        except Exception, err:
            _logger.debug("DB failed adding wee tag: %s" % err)
def cv(self, fold):
    size = len(self.y)
    kf = cross_validation.KFold(size, fold, shuffle=True)
    iteration = 0
    scores = list()
    for train_idx, test_idx in kf:
        X = [self.X[idx] for idx in train_idx]
        y = [self.y[idx] for idx in train_idx]
        X_test = [self.X[idx] for idx in test_idx]
        y_test = [self.y[idx] for idx in test_idx]
        _logger.debug("Training...")
        self.fit(X, y)
        _logger.debug("Testing...")
        score = self.get_test_accuracy(X_test, y_test)
        scores.append(score)
        iteration += 1
        _logger.info("CV iteration %d: CV accuracy: %f" % (iteration, score))
    scores = np.array(scores)
    return scores.mean(), scores.std()
def recursive_crawl(url, encoding, selenium, agent, domain, terminate):
    if crawled_as_hub(agent, url, day_limit=3):
        _logger.debug('ignore, recently(3 days) crawled as hub: %s' % url)
        return
    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("processing %d links" % len(links))
    count = 0
    for idx, link in enumerate(links):
        # ignore href to different domain; accept all href if 'domain' is empty string
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)'
                          % (link['href'].encode('utf-8'), domain))
            continue
        tweet = None
        try:
            #tweet = try_crawl_href(link, encoding, agent, selenium)
            tweet = try_crawl_href(link['href'].encode('utf-8').lower(),
                                   link.text.encode('utf-8').strip(),
                                   encoding, agent, selenium)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' % (err, traceback.format_exc()))
            continue
        if tweet != None:
            count += 1
            try:
                agent.add_crawled_tweet(url, tweet)
                _logger.info('new tweet added to db, %d total, (%d / %d) processed'
                             % (count, idx, len(links)))
            except Exception, err:
                _logger.error('failed to add crawled tweet to DB: %s' % err)
def crawl_href(anchor_url, anchor_text, encoding, selenium):
    tweet = Tweet()
    tweet.href = anchor_url
    tweet.title = anchor_text

    # get content
    _logger.debug('extracting content from (%s)' % tweet.href)
    content = pbrowser.extract_main_body(tweet.href, selenium, encoding)
    if content == '':
        # we dare not deal with an article without words
        return None
    else:
        tweet.content = content.encode('utf-8')

    # get image
    _logger.debug('trying to grab the main image from webpage, hint:(%s)' % tweet.title)
    image_url = ''
    image = None
    try:
        image, image_url = pbrowser.get_main_image_with_hint(
            url=tweet.href, hint=tweet.title, selenium=selenium,
            hint_encoding=encoding)
        _logger.debug('image url: %s' % image_url)
    except Exception, err:
        _logger.error('failed to grab image from %s: %s,%s'
                      % (tweet.href, unicode(err).encode('utf-8'),
                         traceback.format_exc()))
def train_pair(self, p, q):
    if p > q:
        p, q = q, p
    p_len = len(self.by_domain_data[p])
    q_len = len(self.by_domain_data[q])
    _logger.info("Training SVM for %s V.S. %s, %d + %d = %d records"
                 % (p, q, p_len, q_len, p_len + q_len))
    X = list(self.by_domain_data[p])
    X.extend(self.by_domain_data[q])
    y = [p] * p_len
    y.extend([q] * q_len)
    pipeline = Pipeline([
        ("vert", TfidfVectorizer(min_df=1, binary=False,
                                 ngram_range=(1, 1), tokenizer=Tokenizer())),
        ("svm", LinearSVC(loss='l2', penalty="l1", dual=False, tol=1e-3)),
    ])
    if self.cv > 0:
        _logger.info("Doing grid search on %d fold CV" % self.cv)
        params = {
            "svm__C": [1, 10, 50, 100, 500, 1000],
        }
        grid = GridSearchCV(pipeline, params, cv=self.cv, verbose=50)
        grid.fit(X, y)
        pipeline = grid.best_estimator_
        _logger.info("Grid search got best score:%f" % grid.best_score_)
        pipeline.accur = grid.best_score_
    else:
        pipeline.fit(X, y)
        _logger.debug("Testing on training data")
        accur = accuracy_score(y, pipeline.predict(X))
        pipeline.accur = accur
        _logger.info("Training accuracy (%s - %s): %f" % (p, q, accur))
    self.svms[p, q] = pipeline
    return pipeline
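# train_pair() above stores one binary pipeline per (p, q) domain pair in
# self.svms. The original prediction code is not shown here; the function
# below is only a hedged sketch (hypothetical name `predict_one`) of how the
# pairwise models might be combined with one-vs-one voting, weighting each
# vote by the accuracy recorded on the pipeline.
from collections import defaultdict

def predict_one(self, sentence):
    votes = defaultdict(float)
    for (p, q), clf in self.svms.items():
        # each pairwise model votes for one of its two domains
        winner = clf.predict([sentence])[0]
        votes[winner] += getattr(clf, 'accur', 1.0)
    # return the domain with the highest accumulated (weighted) vote
    return max(votes, key=votes.get)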
def process_entry(self, entry, source):
    _logger.debug("pool stat: %d working %d waiting"
                  % (self.pool.running(), self.pool.waiting()))
    url = entry.link.encode('utf-8')
    if self.agent.wee_exists(url):
        _logger.debug("ignore existing wee with url:%s" % url)
        return
    _logger.debug("processing entry from (%s)" % url)
    title = entry.title.encode('utf-8')
    if entry.has_key('author'):
        author = entry.author.encode('utf-8')
    else:
        author = ''
    if entry.has_key('updated_parsed') and entry.updated_parsed != None:
        updated_time = int(time.mktime(entry.updated_parsed))
    else:
        # FeedParser doesn't understand the 'updated' field of this feed,
        # neither can we. Probably some CJK chars.
        updated_time = int(time.time())
    text, html = self.process_content(entry)
    if entry.has_key('tags'):
        tags = [tag.term.encode('utf-8') for tag in entry.tags]
    else:
        tags = []
    try:
        self.agent.add_wee(source['id'], url, title, text, html,
                           updated_time, author, tags)
    except Exception, err:
        _logger.error("DB failed to add wee: %s" % traceback.format_exc())
def get_all_friend(self, callback=None):
    profile_page = self.selenium.get_location()
    _logger.debug('copy location url: %s' % profile_page)
    _logger.debug('loading attentions page')
    self.selenium.click('id=attentions')
    self._wait_load()
    soup = BeautifulSoup(self.selenium.get_html_source())
    friends = [
        self._create_user_from_attention_list(i)
        for i in soup.findAll('li', 'MIB_linedot_l')
    ]
    while True:
        try:
            self.selenium.click(u'下一页')  # the "next page" link
        except Exception, err:
            _logger.info('failed to load next page: %s' % err)
            break
        soup = BeautifulSoup(self.selenium.get_html_source())
        for li in soup.findAll('li', 'MIB_linedot_l'):
            friends.append(self._create_user_from_attention_list(li))
            if callback != None:
                callback(li)
    return friends
def clean(X, y, k=10):
    _logger.info("cleaning based on %d-fold cross validation" % k)
    size = len(y)
    kf = KFold(size, n_folds=k, shuffle=True)
    fold = 1
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        pipeline = Pipeline([
            ("vert", TfidfVectorizer(min_df=1, binary=True,
                                     ngram_range=(1, 3), tokenizer=Tokenizer())),
            ("clf", LinearSVC(loss='l1', penalty="l2",
                              multi_class="ovr", class_weight="auto")),
        ])
        _logger.debug("Training fold %d" % fold)
        pipeline.fit(X_train, y_train)
        _logger.debug("Predicting for fold %d" % fold)
        y_pred = pipeline.predict(X_test)
        _logger.info("fold %d got accuracy: %f" % (fold, accuracy_score(y_test, y_pred)))
        right_f = open("fold%d.right.dat" % fold, "w")
        wrong_f = open("fold%d.wrong.dat" % fold, "w")
        size = len(y_test)
        for i in xrange(size):
            sent = X_test[i].encode('utf-8')
            pred = y_pred[i].encode('utf-8')
            gold = y_test[i].encode('utf-8')
            if pred != gold:
                wrong_f.write("%s\t%s\t%s\n" % (pred, gold, sent))
            else:
                right_f.write("%s\t%s\n" % (sent, gold))
        right_f.close()
        wrong_f.close()
        fold += 1
def crawl(self, callback=None, author_file=None):
    _logger.debug("browser init finished")
    self.author_to_crawl = []
    if author_file == None:
        authors = self.get_all_author()
        # Dump authors to local file
        with open("author_list", 'w') as output:
            output.write("\n".join(authors))
    else:
        with open(author_file, 'r') as author_file_input:
            authors = author_file_input.read().split()
        for author in authors:
            author = author.strip()
            if len(author) > 0:
                self.author_to_crawl.append(author)
    amount = int(math.ceil(len(self.author_to_crawl) / float(self.shard_count)))
    start = self.shard_id * amount
    self.author_to_crawl = self.author_to_crawl[start:start + amount]
    _logger.info("crawling %d to %d" % (start, start + amount))
    self.crawl_authors(self.author_to_crawl, callback)
def add_to_db(agent, info):
    if info['binary'] == '':
        _logger.error('info binary is empty string, ignore: %s' % info)
        return
    bin = info['binary']
    md5 = hashlib.md5(bin).hexdigest()
    if agent.pic_exists(md5):
        _logger.debug("pic exists in DB, only set popularity")
        agent.update_popularity(md5, info['popularity'])
        return
    dirpath, filepath = get_file_path_by_date(datetime.now())
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    filepath = filepath + "." + info['ext']
    with open(filepath, "w") as output:
        output.write(bin)
    agent.add_pic(filepath, info['title'], info['description'],
                  info['author'], info['tags'], md5, info['popularity'])
def pop_tweet_stack(self, email):
    if email != None:
        self.cursor.execute(
            'select tweet_id from tweet_stack where user_email = %s', email)
    else:
        self.cursor.execute(
            'select tweet_id from tweet_stack order by tweet_id desc')
    if self.cursor.rowcount == 0:
        _logger.debug("failed to pop tweet stack, it's empty, email=%s" % email)
        return None
    if email == None:
        all_rows = list(self.cursor.fetchall())
        random.shuffle(all_rows)
    else:
        all_rows = self.cursor.fetchall()
    for cur_row in all_rows:
        tweet_id = cur_row['tweet_id']
        self.cursor.execute('delete from tweet_stack where tweet_id = %s', tweet_id)
        self.conn.commit()
        _logger.debug('tweet stack popped, id=%d' % tweet_id)
        self.cursor.execute('select * from tweet_crawled where id = %s', tweet_id)
        if self.cursor.rowcount == 0:
            _logger.debug('failed to find corresponding raw tweet with id = %s' % tweet_id)
            continue
        raw_tweet = self.cursor.fetchone()
        t = Tweet(title=raw_tweet['title'], content=raw_tweet['content'],
                  href=raw_tweet['href'], image_ext=raw_tweet['image_ext'],
                  image_bin=raw_tweet['image_bin'])
        return t
    _logger.debug('all tweets in stack tried, no ID found in DB')
    return None
def crawl_one_author(self, url, callback):
    page = 1
    while True:
        _logger.info("opening page URL: %s" % url)
        self.br.open(url, timeout=TIMEOUT)
        soup = BeautifulSoup(self.br.response().read())
        url = self.br.geturl()
        img_div = soup.findAll('div', 'images')
        imgs = list(itertools.chain(
            *[div.findAll('a', target='_blank') for div in img_div]))
        imgs.extend(soup.findAll('a', {'data-location': 'content'}))
        _logger.debug("%d images on this page" % len(imgs))
        for a in imgs:
            img_url = a['href']
            if img_url in self.crawl_history:
                _logger.debug('ignoring crawled URL: %s' % img_url)
                continue
            info = None
            try:
                all_info = self.grab_image_info(img_url)
                self.logfile.write(img_url + '\n')
                self.logfile.flush()
                _logger.debug('image processed %s' % img_url)
            except Exception, err:
                _logger.error('processing one image url failed, url:%s, %s'
                              % (img_url, err))
            else:
                for info in all_info:
                    try:
                        if callback != None:
                            callback(info=info)
                    except Exception, err:
                        _logger.error('callback failed, image url: %s, %s, %s'
                                      % (img_url, err, traceback.format_exc()))
            _logger.debug('sleeping for 5 sec')
            time.sleep(5)
def transform(self, X, y=None):
    #return self.count_vec.transform(X)
    _logger.debug("Doing tfidf transform")
    Xc = self.count_vec.transform(X)
    X_last = self.collect_last_term(X)
    _logger.debug("Doing last term transform")
    Xl = self.last_vec.transform(X_last)
    _logger.debug("stacking features")
    ret = sparse.hstack([Xc, Xl])
    # extra binary feature: sentence contains both a location and an
    # organization placeholder token
    tokens = self.count_vec.build_tokenizer()
    l = list()
    for sent in X:
        terms = tokens(sent)
        l.append(1 if ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0)
    l = np.array(l)
    l.shape = len(l), 1
    ret = sparse.hstack([ret, l])
    _logger.debug("vectorization transform done")
    return ret
def add_wee_source(self, url, pr=0, tags=[]):
    source_id = self.get_source_id(url)
    if source_id == -1:
        _logger.debug("source(%s) doesn't exist, will insert" % url)
        self.cursor.execute("insert into wee_source(url, pr) values(%s, %s)",
                            (url, pr))
        self.conn.commit()
        source_id = self.get_source_id(url)
    _logger.debug("source(%s) id fetched: %d" % (url, source_id))
    for tag in tags:
        tag = tag.encode('utf-8')
        self.cursor.execute("insert ignore into source_tag values(%s, %s)",
                            (source_id, tag))
        self.conn.commit()
        _logger.debug('tag %s added to source %d' % (tag, source_id))
def try_crawl_href(anchor_url, anchor_text, encoding, agent, selenium):
    _logger.debug('crawling anchor (%s), URL: %s' % (anchor_text, anchor_url))

    # filters
    # ignore bad-looking anchors
    if util.chinese_charactor_count(anchor_text.decode('utf-8')) < 10:
        _logger.debug('too few chinese chars in anchor text, ignoring')
        return None

    # ignore same href crawled recently
    if crawled_as_terminal(agent, anchor_url, anchor_text, 30):
        _logger.debug('ignore %s, same href was crawled within the last 30 days'
                      % anchor_url)
        return None

    tweet = crawl_href(anchor_url, anchor_text, encoding, selenium)
    _logger.info('crawl_href finished, anchor-text:(%s)' % anchor_text)
    return tweet
def fit(self, X, y=None):
    _logger.debug("Fitting count vectorizer")
    self.count_vec.fit(X)
    X_last = self.collect_last_term(X)
    _logger.debug("Fitting last-term vectorizer")
    self.last_vec.fit(X_last)
    return self
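# The fit()/transform()/collect_last_term() methods above form a custom
# sklearn-style vectorizer: TF-IDF features stacked with a last-term feature
# and a location/organization indicator column. A hedged usage sketch,
# assuming the enclosing class is exposed as `SentenceVectorizer`
# (a hypothetical name, not confirmed by the source):
#
#     vec = SentenceVectorizer()
#     clf = LinearSVC()
#     clf.fit(vec.fit(X_train).transform(X_train), y_train)
#     y_pred = clf.predict(vec.transform(X_test))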