Example 1
    def get_all_author(self):
        cur_url = "http://tuchong.com/contacts/rank/"

        while True:
            _logger.debug("opening ranking page at %s" % cur_url)
            rank_page = self.br.open(cur_url, timeout=TIMEOUT).read()
            soup = BeautifulSoup(rank_page)

            cur_list = soup.findAll("a", {"data-site-id": True})
            for author in cur_list:
                self.author_to_crawl.append(author['href'])
                _logger.debug('got author %s' % author['href'])

            next_page_anchor = soup.findAll('a', 'next')
            if (len(next_page_anchor) > 1):
                _logger.fatal(
                    'multiple next page anchor found, url:(%s), next page number:%d'
                    % (cur_url, len(next_page_anchor)))

            if (len(next_page_anchor) == 0):
                break

            cur_url = next_page_anchor[0]['href']

        return self.author_to_crawl
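The method above walks tuchong.com's ranking pages until no "next" anchor remains, collecting author links as it goes. A minimal sketch of the same pagination pattern written against requests and the current bs4 API (a hypothetical standalone helper, not part of this crawler; it assumes the same page structure):

import logging
import requests
from bs4 import BeautifulSoup

_logger = logging.getLogger(__name__)

def collect_author_links(start_url, timeout=30):
    """Follow 'next' anchors until they disappear, collecting author hrefs."""
    authors = []
    url = start_url
    while url:
        _logger.debug("opening ranking page at %s", url)
        html = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(html, "html.parser")
        # anchors that carry a data-site-id attribute point at author pages
        for anchor in soup.find_all("a", attrs={"data-site-id": True}):
            authors.append(anchor["href"])
        next_anchor = soup.find("a", class_="next")
        url = next_anchor["href"] if next_anchor else None
    return authors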
Example 2
    def run(self):
        while True:
            self.heartbeat(pending_input=True)
            self.agent.restart()
            tasks = self.agent.get_all_crawler_task()
            my_task = None
            for task in tasks:
                if task['id'] % self.shard_count == self.shard_id:
                    my_task = task
                    break
            if not my_task:
                _logger.debug('no task for process shard %d' % self.shard_id)
                time.sleep(10)
                continue

            self.heartbeat(pending_input=False)
            _logger.debug("Got task:%s" % (my_task))

            try:
                if my_task['ttl'] > 1:
                    self.process_hub(my_task)
                elif my_task['ttl'] == 1:
                    self.process_terminal(my_task)
            except Exception, err:
                _logger.error(
                    'unexpected exception with url(%s):%s, %s' %
                    (my_task['anchor_url'], err, traceback.format_exc()))
            finally:
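The snippet above is cut off inside its finally: block. It dispatches work to shards by taking the first task whose id maps to this shard. A rough Python 3 sketch of the same shard-selection skeleton (except Exception as err instead of the Python 2 except Exception, err); agent, process_hub, process_terminal and the ttl convention are stand-ins taken from the snippet, not from any published API:

import logging
import time
import traceback

_logger = logging.getLogger(__name__)

def run_shard(agent, shard_id, shard_count, process_hub, process_terminal):
    """Pick the first task whose id hashes to this shard and process it."""
    while True:
        tasks = agent.get_all_crawler_task()
        my_task = next((t for t in tasks if t['id'] % shard_count == shard_id), None)
        if my_task is None:
            _logger.debug('no task for shard %d', shard_id)
            time.sleep(10)
            continue
        try:
            if my_task['ttl'] > 1:
                process_hub(my_task)        # hub pages still have depth budget left
            elif my_task['ttl'] == 1:
                process_terminal(my_task)   # leaf pages get terminal processing
        except Exception as err:
            _logger.error('unexpected exception with url(%s): %s, %s',
                          my_task.get('anchor_url'), err, traceback.format_exc())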
Example 3
    def fit(self, X, y):
        self.reset()
        size = len(y)
        for i in xrange(size):
            if (i + 1) % 10000 == 0:
                _logger.debug("%d processed" % (i+1))
            terms = X[i]
            domain = y[i]
            self.training_sentence_count += 1
            terms = terms.split(' ')
            self.domain_count[domain] += 1
            term_set = set()
            for term in terms:
                term = self.get_category(term)
                if term in term_set:
                    continue
                term_set.add(term)
                self.terms.add(term)
                self.count[term, domain] += 1
                self.count[domain] += 1
                self.term_count[term] += 1
                self.domain_has[domain].add(term)

        for domain in self.domain_has:
            backoff = len(self.domain_has[domain]) * self.alpha / self.count[domain]
            backoff /= len(self.term_count) - len(self.domain_has[domain])
            self.domain_backoff[domain] = backoff

        self.domains = self.domain_backoff.keys()
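fit() accumulates per-domain term counts plus a per-domain backoff mass reserved for unseen terms (an absolute-discounting style of smoothing). One plausible way such counts are consumed at prediction time, shown only as a sketch consistent with the backoff formula above and not necessarily what this project's decoder does (assumes 0 < alpha < 1 so discounted counts stay positive):

import math

def score(model, sentence):
    """Rank domains for one space-separated sentence using the counts built in fit()."""
    terms = {model.get_category(t) for t in sentence.split(' ')}
    scores = {}
    for domain in model.domains:
        # log prior: how often this domain appeared in training
        logp = math.log(model.domain_count[domain] / float(model.training_sentence_count))
        for term in terms:
            if term in model.domain_has[domain]:
                # absolute discounting: subtract alpha from the observed count
                p = (model.count[term, domain] - model.alpha) / float(model.count[domain])
            else:
                # mass reserved in fit() for terms never seen with this domain
                p = model.domain_backoff[domain]
            logp += math.log(p)
        scores[domain] = logp
    return scores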
Example 4
def test(model, test_file_path):
    total = 0
    correct = 0
    decoder = NaiveDecoder(model)
    outfile = open("predicted.dat", 'w')
    _logger.info("Testing %s" % test_file_path)
    with open(test_file_path) as test_file:
        processed = 1
        for line in test_file:
            line = line.strip().decode('utf-8')
            if not line:
                continue
            total += 1
            sentence, tag = line.split('\t')

            #sentence = extract(sentence)

            result = decoder.decode(sentence)
            predicted, _ = conv.argmax(result.items())
            outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'), predicted.encode('utf-8'), tag.encode('utf-8')))
            if predicted == tag:
                correct += 1
            if processed % 1000 == 0:
                _logger.debug("%d lines processed" % processed)
            processed += 1
    outfile.close()
    _logger.info("accuracy: %f" % (float(correct) / total))
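conv.argmax above is an unspecified helper; since result is a dict of label to score, the standard library max() with a key function does the same job. A drop-in sketch:

def argmax(items):
    """Return the (key, value) pair with the largest value."""
    return max(items, key=lambda kv: kv[1])

# e.g. argmax({'finance': -1.7, 'sports': -3.2}.items()) == ('finance', -1.7)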
Example 5
 def _sleep(self, name):
     from time import sleep
     if name in self._wp_config['sleep']:
         _logger.debug('sleep %d seconds for %s' % (self._wp_config['sleep'][name],name))
         sleep(self._wp_config['sleep'][name])
     else:
         _logger.debug('sleep 0.5 seconds for ' + name)
         sleep(0.5)
Example 6
 def collect_last_term(self, X):
     X_last = list()
     tokens = self.last_vec.build_tokenizer()
     _logger.debug("Extracting last term for each sentence")
     for sent in X:
         X_last.append(tokens(sent)[-1])
     _logger.debug("Fitting last-term vectorizer")
     return X_last
Example 7
 def grab_image_info_group(self, soup):
     all_info = []
     all_img = soup.findAll('figure', 'post-photo')
     _logger.debug('%d images in group' % len(all_img))
     for img in all_img:
         url = img.find('a')['href']
         all_info.append(self.grab_image_info(url)[0])
         _logger.debug('sleeping for 5 sec')
         time.sleep(5)
     return all_info
Example 8
def main():
    _logger.info("wee indexer started")
    agent = WeeSQLAgent(DB_NAME, DB_USER, DB_PASSWORD)
    agent.start()
    _logger.info("MySQL agent started")
    indexer = Indexer(agent)
    while True:
        #agent.restart()
        indexer.index_new_wee()
        _logger.debug("Sleep for %d sec" % SLEEP_SEC)
        time.sleep(SLEEP_SEC)
Example 9
def bad_proxy(proxy):
    proxy_log = agent.get_proxy_log(proxy)
    if proxy_log == None or proxy_log['use_count'] < PROXY_TRYOUT_COUNT \
            or float(proxy_log['fail_count']) / float(proxy_log['use_count']) < VALID_PROXY_FAIL_RATE:
        return False
    else:
        _logger.debug(
            "bad proxy: addr=%s, use=%d, fail=%d, fail_rate=%.2f%%" %
            (proxy['addr'], proxy_log['use_count'], proxy_log['fail_count'],
             float(proxy_log['fail_count']) / float(proxy_log['use_count']) *
             100))
        return True
Example 10
 def crawl_authors(self, authors, callback):
     for author in authors:
         cur_url = author
         _logger.info("crawling author from %s" % cur_url)
         try:
             self.crawl_one_author(cur_url, callback)
             _logger.debug('sleeping for 5 sec')
             time.sleep(5)
         except Exception, err:
             _logger.error(
                 "crawl one author failed, url:(%s), error:%s, %s" %
                 (cur_url, err, traceback.format_exc()))
             continue
Example 11
def pick_proxy_for_slot(agent, slot_id, all_proxy):
    proxies = [proxy for proxy in all_proxy if proxy['slot_id'] == None]
    if len(proxies) == 0:
        _logger.error("No free proxy for slot %d" % slot_id)
        return
    for proxy in proxies:
        if not bad_proxy(proxy):
            _logger.debug("got healthy proxy at %s" % proxy['addr'])
            agent.update_proxy_slot(slot_id, proxy)
            proxy['slot_id'] = slot_id
            return

    _logger.error("Can't find any decent proxy for slot %d" % slot_id)
Example 12
def main():
    _logger.info("checking dict from %s" % DICT_FILE_PATH)
    agent = WeeSQLAgent(DB_NAME, DB_USER, DB_PASSWORD)
    agent.start()
    unindexed_terms = []

    dict_file = open(DICT_FILE_PATH, 'a+')

    # load all data
    exists = [
        line.split(' ')[1]
        for line in dict_file.read().split('\n')
        if line != ''
    ]
    _logger.info("%d terms exist in old dict" % len(exists))

    terms = agent.get_all_custom_tags()

    _logger.info("checking %d custom tags" % len(terms))

    for term in terms:
        text = term['tag']
        _logger.debug('checking %s' % text)
        if text.find(' ') == -1 and text not in exists:  # ignore if text contains space
            _logger.info("adding %s to dict" % text)
            dict_file.write("%d %s\n" % (len(text.decode('utf-8')), text))
            unindexed_terms.append(text)

    dict_file.flush()
    os.fsync(dict_file.fileno())
    dict_file.close()
    _logger.info("dict updated")

    if len(unindexed_terms) > 0:
        _logger.info("unindexed terms:(%s)" % ",".join(unindexed_terms))
        # must import here rather than in the beginning of file
        # because dict file will be read only when Indexer is imported and
        # we've just updated the dict
        # from indexer import Indexer
        # _logger.info("need to update index for %d terms" % len(unindexed_terms))
        # time.sleep(5)
        # indexer = Indexer(agent)
        # indexer.update_index_for_terms(unindexed_terms)
    else:
        _logger.info("no new tags found")

    agent.stop()
Example 13
    def crawl_second(self, url):
        self._randsleep()
        _logger.debug('opening url:%s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))        

        for anchor in soup.findAll('a'):
            try:
                href = anchor['href']
                # Ignore internal links
                if href[:4] != "http" or href.find('hao123.com') != -1:
                    continue
                self.output.write('  %s %s\n' % (href.encode('utf8'), anchor.text.encode('utf8')))
            except Exception, err:
                _logger.error('got error with anchor(%s): %s' % (str(anchor), err))
Example 14
    def fetch_source(self, source):
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
        _logger.info("crawling source id=%d url=%s" %
                     (source['id'], source['url']))

        cur_time = int(time.time())
        last_crawl_time = source['last_crawl_time']
        if cur_time - last_crawl_time < HOUR:
            _logger.info("ignore source(%s), last crawled %d minutes ago" %
                         (source['url'], (cur_time - last_crawl_time) / 60))
            return

        try:
            _logger.debug("fetching feed from (%s)" % source['url'])
            p = feedparser.parse(source['url'])
            _logger.debug("fetched from (%s)" % source['url'])
            if p.feed.has_key('updated_parsed') and p.feed.updated_parsed != None:
                cur_feed_time = int(time.mktime(p.feed.updated_parsed))
            else:
                # FeedParser doesn't understand the 'updated' field of this
                # feed, and neither can we. Probably some CJK chars.
                cur_feed_time = int(time.time())
            db_feed_time = source['last_feed_time']
            if db_feed_time >= cur_feed_time:
                _logger.info(
                    "ignore source(%s), no new feed. Last feed:%s, cur feed:%s"
                    % (source['url'], datetime.fromtimestamp(db_feed_time),
                       datetime.fromtimestamp(cur_feed_time)))
                self.agent.update_source_time(source)
            else:
                _logger.info("processing %d entries from %s" %
                             (len(p.entries), source['url']))
                for entry in p.entries:
                    self.process_entry(entry, source)
                self.agent.update_source_time(source, cur_feed_time)
                _logger.debug(
                    "source(%s) updated: %s" %
                    (source['url'], datetime.fromtimestamp(cur_feed_time)))

            _logger.info("source(id=%d) success" % source['id'])
            _logger.debug("pool stat: %d working %d waiting" %
                          (self.pool.running(), self.pool.waiting()))
        except Exception, err:
            _logger.error(
                "crawling failed for source id=%d, %s: %s" %
                (source['id'], source['url'], traceback.format_exc()))
Example 15
class FeedCrawler(object):
    def __init__(self, agent, pool):
        self.agent = agent
        self.pool = pool

    def grab_image(self, html, entry):

        wee_url = entry.link.encode('utf-8')
        soup = BeautifulSoup(html, fromEncoding="utf-8")
        img = soup.find('img', src=True)
        if img == None:
            _logger.debug("%s has no image inside" % wee_url)
            return
        url = img['src']

        _logger.debug('downloading image from %s' % url)
        try:
            br = pbrowser.get_browser()
            image = br.download_image(url, base_url=wee_url).read()
        except Exception, err:
            _logger.error("downloading image failed(%s), baseurl(%s): %s" %
                          (url, wee_url, traceback.format_exc()))
            return

        try:
            self.agent.add_wee_image(wee_url, image)
            _logger.debug("imaged added for wee:%s" % wee_url)
        except Exception, err:
            _logger.error("db error, failed to add image for wee %s: %s" %
                          (wee_url, err))
Example 16
class BrowserSinaWeibo:
    def __init__(self, user=None):
        self.user = user
        self.selenium = selenium('localhost', 4444, 'chrome',
                                 'http://www.baidu.com')
        _logger.info('starting selenium')
        self.selenium.start()
        self.selenium.set_timeout(120 * 1000)  # timeout 120 seconds

    def _wait_load(self, minutes=1):
        MIN = 60 * 1000
        try:
            self.selenium.wait_for_page_to_load(timeout=MIN * minutes)
        except Exception, err:
            _logger.error(
                'error waiting for page to load(%d min), will continue: %s' %
                (minutes, err))

    def login_sina_weibo(self):
        _logger.debug('logging in to t.cn')
        TEN_MIN = 10 * 60 * 1000
        try:
            _logger.debug('try logging out, just in case')
            self.selenium.click(u'link=退出')
            self._wait_load()
        except Exception, err:
            _logger.debug('clicking logout link failed')

        # Open sina Weibo
        _logger.debug('opening login page of http://t.sina.com.cn')
        self.selenium.open('http://t.sina.com.cn')
        self._wait_load()
        self.selenium.window_maximize()

        _logger.debug('filling login form')
        try:
            self.selenium.type('id=loginname', self.user.uname)
            self.selenium.type('id=password', self.user.passwd)
            self.selenium.type('id=password_text', self.user.passwd)
            self.selenium.uncheck('id=remusrname')
            self.selenium.click('id=login_submit_btn')
        except Exception, err:
            dumppath = util.dump2file_with_date(
                self.selenium.get_html_source())
            raise Exception(
                'filling t.cn login form failed: %s, page dumped to %s' %
                (err, dumppath))
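BrowserSinaWeibo drives the long-deprecated Selenium RC server (selenium('localhost', 4444, ...)). A rough equivalent of the login flow with the modern Selenium WebDriver API, offered only as an illustration of the mapping; the element ids come from the snippet and are almost certainly stale today:

import logging

from selenium import webdriver
from selenium.webdriver.common.by import By

_logger = logging.getLogger(__name__)

def login_sina_weibo(uname, passwd):
    driver = webdriver.Chrome()
    driver.set_page_load_timeout(120)   # roughly set_timeout(120 * 1000)
    driver.maximize_window()            # roughly window_maximize()

    _logger.debug('opening login page of http://t.sina.com.cn')
    driver.get('http://t.sina.com.cn')

    _logger.debug('filling login form')
    driver.find_element(By.ID, 'loginname').send_keys(uname)
    driver.find_element(By.ID, 'password').send_keys(passwd)
    remember = driver.find_element(By.ID, 'remusrname')
    if remember.is_selected():          # roughly uncheck('id=remusrname')
        remember.click()
    driver.find_element(By.ID, 'login_submit_btn').click()
    return driver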
Example 17
    def grab_image_info(self, img_url):
        info = {
            "title": "",
            "description": "",
            "binary": "",
            "author": "",
            "tags": "",
            "ext": "",
            "popularity": ""
        }

        _logger.debug("opening image URL: %s" % img_url)
        try:
            self.br.open(img_url, timeout=TIMEOUT)
        except Exception, err:
            _logger.error('failed to open url: %s' % img_url)
            return info
Example 18
    def grab_image(self, html, entry):

        wee_url = entry.link.encode('utf-8')
        soup = BeautifulSoup(html, fromEncoding="utf-8")
        img = soup.find('img', src=True)
        if img == None:
            _logger.debug("%s has no image inside" % wee_url)
            return
        url = img['src']

        _logger.debug('downloading image from %s' % url)
        try:
            br = pbrowser.get_browser()
            image = br.download_image(url, base_url=wee_url).read()
        except Exception, err:
            _logger.error("downloading image failed(%s), baseurl(%s): %s" %
                          (url, wee_url, traceback.format_exc()))
            return
Example 19
 def spam_one_blog(self, anchor, href, target_url):
     if target_url.find('/interstitial?url=') != -1:
         _logger.debug('stripped %s to %s' %
                       (target_url, target_url[len('/interstitial?url='):]))
         target_url = target_url[len('/interstitial?url='):]
     error = ''
     retry = 0
     # Open blog post page
     browser = pbrowser.get_browser()
     while retry < 5:
         try:
             res = browser.open(target_url, timeout=10)
             html = res.read()
             break
         except Exception, err:
             error += 'open blog url failed (%d / 5):%s\n' % (retry + 1,
                                                              err)
             retry += 1
Example 20
    def process_content(self, entry):
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
        text = ''
        html = ''
        if entry.has_key('summary_detail'):
            content = entry.summary_detail
            if content.type == u"text/plain" and text == '':
                text = content.value.encode('utf-8')
            elif content.type == u"text/html" and html == '':
                html = content.value.encode('utf-8')
        elif entry.has_key('summary'):
            html = entry.summary.encode('utf-8')

        if html == '' and text == '':
            _logger.error("failed to get text for entry %s" %
                          entry.link.encode('utf-8'))
        return text, html
Example 21
 def train(self):
     self.count = defaultdict(int)
     c = 0
     with open(self.train_path) as infile:
         for line in infile:
             line = line.strip()
             if not line:
                 continue
             terms, domain = line.split('\t')
             term_set = set()
             for term in terms.split(' '):
                 term = term_category(term)
                 if term not in term_set:
                     term_set.add(term)
                     self.count[(term, domain)] += 1
             c += 1
             if c % 10000 == 0:
                 _logger.debug("%d records processed" % c)
Example 22
    def _crawl_fourth(self, url):
        page = 1
        while True:
            _logger.debug('fourth layer page %d (%s)' % (page, url))
            page += 1
            self._randsleep()
            html = self.br.open(url).read()
            html = util.convert_to_utf8(html, 'gb2312')
            soup = BeautifulSoup(html)

            for td in soup.findAll('td', 'f'):
                self.output.write('      %s\n' %
                                  td.find('a').text.encode('utf-8'))
                self.output.flush()
            try:
                # the "next page" text node's parent <a> carries the href
                url = soup.find('font', 'f9').find(text=u"下一页").parent['href']
            except:
                break
Example 23
    def add_wee(self,
                source_id,
                url,
                title,
                text,
                html,
                updated_time,
                author='',
                tags=[]):
        self.cursor.execute(
            'insert into wee(source_id, url, title, text, updated_time, author, html) \
values(%s, %s, %s, %s, %s, %s, %s)',
            (source_id, url, title, text, updated_time, author, html))

        for tag in tags:
            try:
                self.cursor.execute('insert into wee_tag values(%s, %s)',
                                    (url, tag))
            except Exception, err:
                _logger.debug("DB failed adding wee tag: %s" % err)
Example 24
    def cv(self, fold):
        size = len(self.y)
        kf = cross_validation.KFold(size, fold, shuffle=True)
        iteration = 0
        scores = list()
        for train_idx, test_idx in kf:
            X = [self.X[idx] for idx in train_idx]
            y = [self.y[idx] for idx in train_idx]
            X_test = [self.X[idx] for idx in test_idx]
            y_test = [self.y[idx] for idx in test_idx]
            _logger.debug("Training...")
            self.fit(X, y)
            _logger.debug("Testing...")
            score = self.get_test_accuracy(X_test, y_test)
            scores.append(score)
            iteration += 1
            _logger.info("CV iteration %d: CV accuracy: %f" % \
                             (iteration, score))

        scores = np.array(scores)
        return scores.mean(), scores.std()
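cross_validation.KFold(size, fold, shuffle=True) is the pre-0.18 scikit-learn API. A sketch of the same loop against the current sklearn.model_selection module, with model standing in for the snippet's self (reusing its fit and get_test_accuracy methods):

import numpy as np
from sklearn.model_selection import KFold

def cv(model, X, y, fold):
    """K-fold cross validation; returns mean and std of per-fold accuracy."""
    kf = KFold(n_splits=fold, shuffle=True)
    scores = []
    for train_idx, test_idx in kf.split(X):
        model.fit([X[i] for i in train_idx], [y[i] for i in train_idx])
        scores.append(model.get_test_accuracy([X[i] for i in test_idx],
                                              [y[i] for i in test_idx]))
    scores = np.array(scores)
    return scores.mean(), scores.std()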
Example 25
def recursive_crawl(url, encoding, selenium, agent, domain, terminate):
    if crawled_as_hub(agent, url, day_limit=3):
        _logger.debug('ignore, recently(3 days) crawled as hub: %s' % (url))
        return

    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("processing %d links" % (len(links)))
    count = 0
    for idx, link in enumerate(links):
        # ignore href to different domain; accept all href if 'domain' is empty string
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)' %
                          (link['href'].encode('utf-8'), domain))
            continue

        tweet = None
        try:
            #tweet = try_crawl_href(link, encoding, agent, selenium)
            tweet = try_crawl_href(link['href'].encode('utf-8').lower(),
                                   link.text.encode('utf-8').strip(), encoding,
                                   agent, selenium)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' %
                          (err, traceback.format_exc()))
            continue

        if tweet != None:
            count += 1
            try:
                agent.add_crawled_tweet(url, tweet)
                _logger.info(
                    'new tweet added to db, %d total, (%d / %d) processed' %
                    (count, idx, len(links)))
            except Exception, err:
                _logger.error('failed to add crawled tweet to DB: %s' % err)
Example 26
def crawl_href(anchor_url, anchor_text, encoding, selenium):
    tweet = Tweet()
    tweet.href = anchor_url
    tweet.title = anchor_text

    # get content
    _logger.debug('extracting content from (%s)' % tweet.href)
    content = pbrowser.extract_main_body(tweet.href, selenium, encoding)
    if content == '':  # we don't deal with articles that have no words
        return None
    else:
        tweet.content = content.encode('utf-8')

    # get image
    _logger.debug('trying to grab the main image from webpage, hint:(%s)' %
                  tweet.title)
    image_url = ''
    image = None

    try:
        image, image_url = pbrowser.get_main_image_with_hint(
            url=tweet.href,
            hint=tweet.title,
            selenium=selenium,
            hint_encoding=encoding)
        _logger.debug('image url: %s' % image_url)
    except Exception, err:
        _logger.error(
            'failed to grab image from %s: %s,%s' %
            (tweet.href, unicode(err).encode('utf-8'), traceback.format_exc()))
Example 27
    def train_pair(self, p, q):
        if p > q:
            p, q = q, p

        p_len = len(self.by_domain_data[p])
        q_len = len(self.by_domain_data[q])

        _logger.info("Training SVM for %s V.S. %s, %d + %d = %d records" % \
                         (p, q, p_len, q_len, p_len + q_len))

        X = list(self.by_domain_data[p])
        X.extend(self.by_domain_data[q])
        y = [p] * p_len
        y.extend([q] * q_len)

        pipeline = Pipeline([
                ("vert", TfidfVectorizer(min_df = 1, binary = False, ngram_range = (1, 1),
                                         tokenizer = Tokenizer())),
                ("svm", LinearSVC(loss='l2', penalty="l1",
                                  dual=False, tol=1e-3)),
                ])

        if self.cv > 0:
            _logger.info("Doing grid search on %d fold CV" % self.cv)
            params = {
                "svm__C": [1, 10, 50, 100, 500, 1000],
                }
            grid = GridSearchCV(pipeline, params, cv=self.cv, verbose=50)
            grid.fit(X, y)
            pipeline = grid.best_estimator_
            _logger.info("Grid search got best score:%f" % grid.best_score_)
            pipeline.accur = grid.best_score_
        else:
            pipeline.fit(X, y)
            _logger.debug("Testing on training data")
            accur = accuracy_score(y, pipeline.predict(X))
            pipeline.accur = accur
            _logger.info("Training accuracy (%s - %s): %f" % (p, q, accur))
        self.svms[p,q] = pipeline
        return pipeline
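LinearSVC(loss='l2', penalty='l1') uses parameter names that scikit-learn has since renamed. A sketch of an equivalent pairwise pipeline and grid search in today's spelling, keeping the snippet's C grid but leaving out its custom Tokenizer:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def build_pair_svm(X, y, cv_folds=5):
    """Grid-search C for a TF-IDF + linear SVM pipeline and return the best model."""
    pipeline = Pipeline([
        ("vert", TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 1))),
        # 'l2' loss is now spelled 'squared_hinge'; the l1 penalty still needs dual=False
        ("svm", LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3)),
    ])
    grid = GridSearchCV(pipeline, {"svm__C": [1, 10, 50, 100, 500, 1000]}, cv=cv_folds)
    grid.fit(X, y)
    return grid.best_estimator_, grid.best_score_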
Example 28
    def process_entry(self, entry, source):
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
        url = entry.link.encode('utf-8')
        if self.agent.wee_exists(url):
            _logger.debug("ignore existed wee with url:%s" % url)
            return

        _logger.debug("processing entry from (%s)" % url)
        title = entry.title.encode('utf-8')
        if entry.has_key('author'):
            author = entry.author.encode('utf-8')
        else:
            author = ''

        if entry.has_key('updated_parsed') and entry.updated_parsed != None:
            updated_time = int(time.mktime(entry.updated_parsed))
        else:
            # FeedParser doesn't understand the 'updated' field of this
            # feed, and neither can we. Probably some CJK chars.
            updated_time = int(time.time())
        text, html = self.process_content(entry)
        if entry.has_key('tags'):
            tags = [tag.term.encode('utf-8') for tag in entry.tags]
        else:
            tags = []
        try:
            self.agent.add_wee(source['id'], url, title, text, html,
                               updated_time, author, tags)
        except Exception, err:
            _logger.error("DB failed to add wee: %s" % traceback.format_exc())
Example 29
    def get_all_friend(self, callback=None):
        profile_page = self.selenium.get_location()
        _logger.debug('copy location url: %s' % profile_page)
        _logger.debug('loading attentions page')
        self.selenium.click('id=attentions')
        self._wait_load()

        soup = BeautifulSoup(self.selenium.get_html_source())
        friends = [
            self._create_user_from_attention_list(i)
            for i in soup.findAll('li', 'MIB_linedot_l')
        ]
        while True:
            try:
                self.selenium.click(u'link=下一页')
            except Exception, err:
                _logger.info('failed to load next page: %s' % err)
                soup = BeautifulSoup(self.selenium.get_html_source())
                for li in soup.findAll('li', 'MIB_linedot_l'):
                    friends.append(self._create_user_from_attention_list(li))
                    if callback != None:
                        callback(li)
Example 30
def clean(X, y, k=10):
    _logger.info("cleaning base on %d-fold cross validation" % k)

    size = len(y)
    kf = KFold(size, n_folds=k, shuffle=True)
    fold = 1
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        pipeline = Pipeline([
                ("vert", TfidfVectorizer(min_df = 1, binary = True, ngram_range = (1, 3),
                                         tokenizer = Tokenizer())),
                ("clf", LinearSVC(loss='l1',
                                  penalty="l2",
                                  multi_class="ovr",
                                  class_weight="auto")),
                ])
        _logger.debug("Training fold %d" % fold)
        pipeline.fit(X_train, y_train)
        _logger.debug("Predicting for fold %d" % fold)
        y_pred = pipeline.predict(X_test)
        _logger.info("fold %d got accuracy: %f" % (fold, accuracy_score(y_test, y_pred)))

        right_f = open("fold%d.right.dat" % fold, "w")
        wrong_f = open("fold%d.wrong.dat" % fold, "w")

        size = len(y_test)
        for i in xrange(size):
            sent, pred, gold = X_test[i].encode('utf-8'), y_pred[i].encode('utf-8'), y_test[i].encode('utf-8')
            if pred != gold:
                wrong_f.write("%s\t%s\t%s\n" % (pred, gold, sent))
            else:
                right_f.write("%s\t%s\n" % (sent, gold))

        right_f.close()
        wrong_f.close()

        fold += 1
Example 31
 def login_sina_weibo(self):
     _logger.debug('logging in to t.cn')
     TEN_MIN = 10 * 60 * 1000
     try:
         _logger.debug('try logging out, just in case')
         self.selenium.click(u'link=退出')
         self._wait_load()
     except Exception, err:
         _logger.debug('clicking logout link failed')
Example 32
    def crawl(self, callback=None, author_file=None):
        _logger.debug("browser init finished")

        self.author_to_crawl = []

        if author_file == None:
            authors = self.get_all_author()
            # Dump authors to local file
            with open("author_list", 'w') as output:
                output.write("\n".join(authors))
        else:
            with open(author_file, 'r') as author_file_input:
                authors = author_file_input.read().split()
                for author in authors:
                    author = author.strip()
                    if len(author) > 0:
                        self.author_to_crawl.append(author)

        amount = int(
            math.ceil(len(self.author_to_crawl) / float(self.shard_count)))
        start = self.shard_id * amount
        self.author_to_crawl = self.author_to_crawl[start:start + amount]
        _logger.info("crawling %d to %d" % (start, start + amount))
        self.crawl_authors(self.author_to_crawl, callback)
Example 33
def add_to_db(agent, info):
    if info['binary'] == '':
        _logger.error('info binary is empty string, ignore: %s' % info)
        return
    bin = info['binary']
    md5 = hashlib.md5(bin).hexdigest()

    if agent.pic_exists(md5):
        _logger.debug("pic exists in DB, only set popularity")
        agent.update_popularity(md5, info['popularity'])
        return

    dirpath, filepath = get_file_path_by_date(datetime.now())

    if not os.path.exists(dirpath):
        os.mkdir(dirpath)

    filepath = filepath + "." + info['ext']

    with open(filepath, "wb") as output:  # write the image bytes in binary mode
        output.write(bin)

    agent.add_pic(filepath, info['title'], info['description'], info['author'],
                  info['tags'], md5, info['popularity'])
Example 34
    def pop_tweet_stack(self, email):
        if email != None:
            self.cursor.execute(
                'select tweet_id from tweet_stack where user_email = %s',
                email)
        else:
            self.cursor.execute(
                'select tweet_id from tweet_stack order by tweet_id desc')

        if self.cursor.rowcount == 0:
            _logger.debug('failed to pop tweet stack, it\'s empty, email=%s' %
                          email)
            return None

        if email == None:
            all_rows = list(self.cursor.fetchall())
            random.shuffle(all_rows)
        else:
            all_rows = self.cursor.fetchall()

        for cur_row in all_rows:
            tweet_id = cur_row['tweet_id']
            self.cursor.execute('delete from tweet_stack where tweet_id = %s',
                                tweet_id)
            self.conn.commit()
            _logger.debug('tweet stack popped, id=%d' % tweet_id)

            self.cursor.execute('select * from tweet_crawled where id = %s',
                                tweet_id)
            if self.cursor.rowcount == 0:
                _logger.debug(
                    'failed to find corresponding raw tweet with id = %s' %
                    tweet_id)
                continue
            raw_tweet = self.cursor.fetchone()
            t = Tweet(title=raw_tweet['title'],
                      content=raw_tweet['content'],
                      href=raw_tweet['href'],
                      image_ext=raw_tweet['image_ext'],
                      image_bin=raw_tweet['image_bin'])

            return t

        _logger.debug('all tweet in stack tried, none ID found in DB')
        return None
Example 35
    def crawl_one_author(self, url, callback):
        page = 1
        while True:
            _logger.info("opening page URL: %s" % url)
            self.br.open(url, timeout=TIMEOUT)
            soup = BeautifulSoup(self.br.response().read())
            url = self.br.geturl()

            img_div = soup.findAll('div', 'images')
            imgs = list(
                itertools.chain(
                    *[div.findAll('a', target='_blank') for div in img_div]))
            imgs.extend(soup.findAll('a', {'data-location': 'content'}))
            _logger.debug("%d images on this page" % len(imgs))

            for a in imgs:
                img_url = a['href']

                if img_url in self.crawl_history:
                    _logger.debug('ignoring crawled URL: %s' % img_url)
                    continue

                info = None
                try:
                    all_info = self.grab_image_info(img_url)
                    self.logfile.write(img_url + '\n')
                    self.logfile.flush()
                    _logger.debug('image processed %s' % img_url)
                except Exception, err:
                    _logger.error(
                        'processing one image url failed, url:%s, %s' %
                        (img_url, err))
                else:
                    for info in all_info:
                        try:
                            if callback != None:
                                callback(info=info)
                        except Exception, err:
                            _logger.error(
                                'callback failed, image url: %s, %s, %s' %
                                (img_url, err, traceback.format_exc()))

                _logger.debug('sleeping for 5 sec')
                time.sleep(5)
Example 36
    def transform(self, X, y = None):
        #return self.count_vec.transform(X)
        _logger.debug("Doing tfidf transform")
        Xc = self.count_vec.transform(X)

        X_last = self.collect_last_term(X)
        _logger.debug("Doing last term transform")
        Xl = self.last_vec.transform(X_last)
        _logger.debug("stacking features")
        ret = sparse.hstack([Xc, Xl])
        
        tokens = self.count_vec.build_tokenizer()
        l = list()
        for sent in X:
            terms = tokens(sent)
            l.append(1 if  ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0)

        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")

        return ret
Example 37
    def add_wee_source(self, url, pr=0, tags=[]):
        source_id = self.get_source_id(url)
        if source_id == -1:
            _logger.debug("source(%s) doesn't exist, will insert" % url)
            self.cursor.execute(
                "insert into wee_source(url, pr) values(%s, %s)", (url, pr))
            self.conn.commit()
            source_id = self.get_source_id(url)

        _logger.debug("source(%s) id fetched: %d" % (url, source_id))

        for tag in tags:
            tag = tag.encode('utf-8')
            self.cursor.execute("insert ignore into source_tag values(%s, %s)",
                                (source_id, tag))
            self.conn.commit()
            _logger.debug('tag %s added to source %d' % (tag, source_id))
Example 38
def try_crawl_href(anchor_url, anchor_text, encoding, agent, selenium):
    _logger.debug('crawling anchor (%s), URL: %s' % (anchor_text, anchor_url))
    # filters
    # ignore bad-looking anchors
    if util.chinese_charactor_count(anchor_text.decode('utf-8')) < 10:
        _logger.debug('too few chinese chars in anchor text, ignoring')
        return None

    # ignore same href crawled recently
    if crawled_as_terminal(agent, anchor_url, anchor_text, 30):
        _logger.debug('ignore %s, same href was crawled within the last 30 days' %
                      anchor_url)
        return None

    tweet = crawl_href(anchor_url, anchor_text, encoding, selenium)
    _logger.info('crawl_href finished, anchor-text:(%s)' % anchor_text)
    return tweet
Example 39
 def fit(self, X, y = None):
     _logger.debug("Fitting count vectorizer")
     self.count_vec.fit(X)
     X_last = self.collect_last_term(X)
     self.last_vec.fit(X_last)
     return self