Example 1
 def handle_int(self, signum, frame):
     if os.getpid() != self.root_pid:
         return
     _logger.info('got signal (%d), will shut down gracefully' % signum)
     self.shutdown()
     _logger.info('all processes killed, will call exit(0)')
     sys.exit(0)
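A handler like handle_int only fires if it is registered with the signal module in the root process. Below is a minimal sketch of that wiring, assuming the daemon records its own pid as root_pid at startup; the Daemon class name and the shutdown stub are illustrative, not from the original project:

import os
import signal
import sys

class Daemon(object):
    def __init__(self):
        # children forked later inherit root_pid, so they return early in handle_int
        self.root_pid = os.getpid()
        # route SIGINT (Ctrl-C) and SIGTERM to the graceful shutdown handler
        signal.signal(signal.SIGINT, self.handle_int)
        signal.signal(signal.SIGTERM, self.handle_int)

    def shutdown(self):
        pass  # placeholder; the real daemon kills its workers here (see Example 11)

    def handle_int(self, signum, frame):
        if os.getpid() != self.root_pid:
            return
        self.shutdown()
        sys.exit(0)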
Example 2
def test(model, test_file_path):
    total = 0
    correct = 0
    decoder = NaiveDecoder(model)
    outfile = open("predicted.dat", 'w')
    _logger.info("Testing %s" % test_file_path)
    with open(test_file_path) as test_file:
        processed = 1
        for line in test_file:
            line = line.strip().decode('utf-8')
            if not line:
                continue
            total += 1
            sentence, tag = line.split('\t')

            #sentence = extract(sentence)

            result = decoder.decode(sentence)
            predicted, _ = conv.argmax(result.items())
            outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'), predicted.encode('utf-8'), tag.encode('utf-8')))
            if predicted == tag:
                correct += 1
            if processed % 1000 == 0:
                _logger.debug("%d lines processed" % processed)
            processed += 1
    outfile.close()
    _logger.info("accuracy: %f" % (float(correct) / total))
Example 3
def recursive_crawl(url, encoding, selenium, agent, domain, terminate):
    if crawled_as_hub(agent, url, day_limit=3):
        _logger.debug('ignore, recently (3 days) crawled as hub: %s' % url)
        return

    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("processing %d links" % (len(links)))
    count = 0
    for idx, link in enumerate(links):
        # ignore href to different domain; accept all href if 'domain' is empty string
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)' %
                          (link['href'].encode('utf-8'), domain))
            continue

        tweet = None
        try:
            #tweet = try_crawl_href(link, encoding, agent, selenium)
            tweet = try_crawl_href(link['href'].encode('utf-8').lower(),
                                   link.text.encode('utf-8').strip(), encoding,
                                   agent, selenium)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' %
                          (err, traceback.format_exc()))
            continue

        if tweet != None:
            count += 1
            try:
                agent.add_crawled_tweet(url, tweet)
                _logger.info(
                    'new tweet added to db, %d total, (%d / %d) processed' %
                    (count, idx, len(links)))
            except Exception, err:
                _logger.error('failed to add crawled tweet to DB: %s' % err)
Example 4
 def __init__(self, user=None):
     self.user = user
     self.selenium = selenium('localhost', 4444, 'chrome',
                              'http://www.baidu.com')
     _logger.info('starting selenium')
     self.selenium.start()
     self.selenium.set_timeout(120 * 1000)  # timeout 120 seconds
Example 5
 def _crawl_thirdary(self, anchor):
     self.output.write('    %s\n' % anchor.text.encode('utf-8'))
     _logger.info('crawling fourth (%s)' % anchor['href'])
     try:
         self._crawl_fourth(anchor['href'])
     except Exception, err:
         _logger.error('fourth(%s) failed: %s' % (anchor['href'], err))
Example 6
def main(spec):
    et = ElementTree.parse("task.conf")
    cases = et.findall('case')
    for case in cases:
        query = case.attrib['search']
        search_count = int(case.attrib['search-count'])
        compose_count = int(case.attrib['compose-count'])
        _logger.info('processing case, query=[%s]' % query)
        if spec['crawl']:
            # Start crawler
            _logger.info('kicking off crawler, keyword=(%s), count=%d' %
                         (query, search_count))
            gcrawler.start_crawler(keyword=query, count=search_count)
            # Start interpreter
            _logger.info('kicking off interpreter')
            interpreter.interpret('crawler_out', 'interpret_out')
        # Start composer
        if spec['compose']:
            _logger.info('start composing %d articles' % compose_count)
            link_info = {}
            for link in case.findall('link'):
                link_info[link.attrib['anchor']] = list()
                for href in link.findall('href'):
                    link_info[link.attrib['anchor']].append(href.text)
            composer.compose('interpret_out', 'composer_out', compose_count,
                             link_info)


        # Start poster
        if spec['post']:
            _logger.info('start posting')
            post_count = int(case.attrib['post-count'])
            poster.post_spam('composer_out', limit=post_count)
Example 7
    def start_spam(self,
                   anchor,
                   href,
                   keyword,
                   count=100,
                   verbose=False,
                   fingerprint="will not be published"):
        """
        Start spamming, use keyword to query Google, request for 'count' results. Anchor text and url are specified with
        'anchor' and 'href'.
        """
        # Make directory
        path = "./%s(%s).%d/" % (anchor, keyword, count)
        if verbose:
            if os.path.exists(path):
                _logger.error('%s exists, I\'ll have to remove it, sorry' %
                              path)
                import shutil
                shutil.rmtree(path)
            os.mkdir(path)

        query = keyword + " " + fingerprint
        self.cur_fingerprint = fingerprint[1:-1]
        lazy_result = []
        urls = pbrowser.ask_google(
            query,
            count,
            callback=lambda new_url: lazy_result.append(
                self.process_new_url(new_url, anchor, href)),
            sleep_min=15,
            sleep_max=20,
        )
        _logger.info(
            'ask_google returned %d results, start joining %d targets' %
            (len(urls), len(lazy_result)))
        success_count = 0
        for result in lazy_result:
            try:
                success, info = result.eval()
            except Exception, err:
                _logger.error("failed extracting lazy result: %s" % err)
                continue
            else:
                try:
                    output_path = path + (urlparse(info[0]).hostname +
                                          str(random.randint(1, 1000)))
                except:
                    _logger.error("can't parse hostname from target url:[%s]" %
                                  info[0])
                    output_path = path + "info[0]" + str(
                        random.randint(1, 1000))
            if success:
                success_count += 1
                output_path += '.success.html'
            else:
                output_path += '.fail.html'
            if verbose:
                with open(output_path.encode('utf-8'), 'w') as output:
                    output.write(info[0] + '\n')
                    output.write(info[1])
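A call matching the signature above might look like the sketch below. The Spammer class name and every argument value are hypothetical; the quoted fingerprint only reflects that start_spam strips the first and last character via fingerprint[1:-1]:

# Hypothetical usage sketch; 'Spammer' and all argument values are made up for illustration.
spammer = Spammer()
spammer.start_spam(anchor='cheap flights',
                   href='http://example.com/deals',
                   keyword='travel',
                   count=50,
                   verbose=True,
                   fingerprint='"zx9-spam-fingerprint"')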
Example 8
def fill_account(daemon, helper, user):

    sele = daemon.selenium
    daemon.user = user
    _logger.info('start joining groups')
    try:
        daemon.grouping(force=True)
    except Exception, err:
        _logger.error('grouping failed: %s' % err)
Example 9
def main():
    _logger.info("wee indexer started")
    agent = WeeSQLAgent(DB_NAME, DB_USER, DB_PASSWORD)
    agent.start()
    _logger.info("MySQL agent started")
    indexer = Indexer(agent)
    while True:
        #agent.restart()
        indexer.index_new_wee()
        _logger.debug("Sleep for %d sec" % SLEEP_SEC)
        time.sleep(SLEEP_SEC)
Example 10
    def process_terminal(self, task):
        anchor_text = task['anchor_text']
        anchor_url = task['anchor_url']
        _logger.info('processing terminal link, url:%s' % anchor_url)

        tweet = None
        try:
            tweet = try_crawl_href(anchor_url, anchor_text, task['encoding'],
                                   self.agent, self.sele)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' %
                          (err, traceback.format_exc()))
Example 11
 def shutdown(self):
     self.agent.stop()
     if hasattr(self, 'workers'):
         for worker in self.workers:
             pid = worker.pid
             try:
                 self.kill_worker(worker)
                 _logger.info('child process %d killed' % pid)
             except Exception, err:
                 _logger.error(
                     'failed to kill child pid:%d, %s, it will become an orphan'
                     % (pid, err))
Example 12
    def crawl(self, url):
        self.output = open('hao123.crawl%s' % datetime.now().date(), 'w')
        _logger.info('opening hao123 home page: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))

        for top_tier in soup.findAll('table', monkey='cool'):
            anchor = top_tier.find('a')
            _logger.info('crawling top tier category: %s (%s)'
                         % (anchor.text, anchor['href']))
            self.crawl_first(pbrowser.abs_url(url, anchor['href']))
        self.output.close()
Example 13
 def load_data(self):
     _logger.info("Loading training data from %s" % self.train_path)
     self.X = []
     self.y = []
     with open(self.train_path) as train_file:
         for line in train_file:
             line = line.strip().decode('utf-8')
             if not line:
                 continue
             terms, domain = line.split('\t')
             self.X.append(terms)
             self.y.append(domain)
Example 14
def compose(infile, outfile, count, link_info, min_word = 200, max_word = 500):
    infile = open(infile)
    paragraphs = _parse_paragraphs(infile)
    infile.close()
    articles = []
    link_dropper = _LinkDropper(link_info)
    for i in range(0, count):
        _logger.info('writing article ' + str(i) + ' of ' + str(count))
        articles.append(link_dropper._drop_links(write_one_article(paragraphs, min_word, max_word)) + '\n\n')
    outfile = open(outfile, 'w')
    outfile.write(('\n\n' + doc_sep + '\n\n').join(articles))
    outfile.close()
Example 15
    def process_hub(self, task):
        url = task['anchor_url']
        _logger.info('processing hub page, url:%s' % url)
        last_crawl = self.agent.get_crawl_history(url)
        now = datetime.now()
        if (now - last_crawl).days <= 3:
            _logger.debug('ignore, recently crawled: %s' % str(last_crawl))
            return

        domain = task['domain']
        encoding = task['encoding']
        links = pbrowser.get_all_href(url, encoding)
        _logger.debug("got %d links" % (len(links)))

        for idx, link in enumerate(links):
            if urlparse(
                    link['href'].encode('utf-8')).netloc.find(domain) == -1:
                _logger.debug('ignore (%s), different from domain (%s)' %
                              (link['href'].encode('utf-8'), domain))
                continue

            # make temporary source
            cur_url = link['href'].encode('utf-8').lower()
            cur_text = link.text.encode('utf-8').strip()

            if crawled_as_hub(self.agent, cur_url, day_limit=3):
                _logger.debug('ignore, recently (3 days) crawled as hub: %s' %
                              (cur_url))
                continue

            if crawled_as_terminal(self.agent, cur_url, cur_text,
                                   day_limit=30):
                _logger.debug(
                    'ignore, recently (30 days) crawled as terminal: %s' %
                    (cur_url))
                continue

            if in_task_queue(self.agent, cur_url, cur_text):
                _logger.debug('ignore, already added to task queue: %s' %
                              (cur_url))
                continue

            ttl = task['ttl'] - 1
            try:
                self.agent.add_crawler_task(anchor_url=cur_url,
                                            anchor_text=cur_text,
                                            encoding=encoding,
                                            domain=domain,
                                            ttl=ttl)
                _logger.debug('%s added to task in DB' % cur_url)
            except Exception, err:
                _logger.error('failed to add crawler task, url:(%s), %s' %
                              (cur_url, err))
Example 16
def load_data(train_path):
    _logger.info("Loading data from %s" % train_path)
    X = []
    y = []
    with open(train_path) as train_file:
        for line in train_file:
            line = line.strip().decode("utf-8")
            if not line:
                continue
            terms, domain = line.split("\t")
            X.append(terms)
            y.append(domain)
    return np.array(X), np.array(y)
Example 17
 def _crawl_secondary(self, div):
     tb = div
     self.output.write('  %s\n' % div.text.encode('utf-8'))
     while not hasattr(tb, 'name') or tb.name != u"table":
         tb = tb.nextSibling
     for third in tb.findAll('a'):
         _logger.info('crawling thirdary (%s)' % third.text)
         try:
             self._crawl_thirdary(third)
         except Exception, err:
             _logger.error(
                 'third(%s) failed: %s\n%s' %
                 (third.text.encode('utf-8'), err, traceback.format_exc()))
Example 18
 def crawl_authors(self, authors, callback):
     for author in authors:
         cur_url = author
         _logger.info("crawling author from %s" % cur_url)
         try:
             self.crawl_one_author(cur_url, callback)
             _logger.debug('sleeping for 5 sec')
             time.sleep(5)
         except Exception, err:
             _logger.error(
                 "crawl one author failed, url:(%s), error:%s, %s" %
                 (cur_url, err, traceback.format_exc()))
             continue
Example 19
def check_proxies(agent):
    config = agent.get_core_config()
    PROXY_TRYOUT_COUNT = int(config['proxy_tryout_count'])
    VALID_PROXY_FAIL_RATE = float(config['valid_proxy_fail_rate'])

    all_proxy = agent.get_all_proxy()
    account_num = agent.get_all_user_count()
    slot_num = math.ceil(account_num / 50.0)
    slot_num = int(slot_num)
    _logger.info(
        "%d account, %d proxy slots, fail rate limit: %.2f%%, try out: %d" %
        (account_num, slot_num, VALID_PROXY_FAIL_RATE * 100,
         PROXY_TRYOUT_COUNT))

    for slot_id in range(slot_num):
        proxy = agent.get_proxy_by_slot(slot_id)
        if proxy == None:
            _logger.info("proxy slot #%d is empty, try picking proxy for it" %
                         slot_id)
            pick_proxy_for_slot(agent, slot_id, all_proxy)
        elif bad_proxy(proxy):
            _logger.info(
                "proxy slot #%d is bad with addr: %s, will pick new one" %
                (slot_id, proxy['addr']))
            agent.remove_proxy_from_slot(proxy)
            pick_proxy_for_slot(agent, slot_id, all_proxy)
        else:
            _logger.info("proxy slot #%d OK, addr: %s" %
                         (slot_id, proxy['addr']))
Example 20
def vectorize(tfidf=False, binary=False):

    _logger.info("Loading...")
    
    trainX = [r[0] for r in tsv.reader(conv.redirect('train.tokenized.dat'))]
    testX = [r[0] for r in tsv.reader(conv.redirect('test.tokenized.dat'))]
    
    vectorizer = None
    if tfidf:
        vectorizer = TfidfVectorizer
    else:
        vectorizer = CountVectorizer
    
    _logger.info("Fitting and transforming...")
    vectorizer = vectorizer(token_pattern=u'(?u)\\b\\w+\\b', binary=binary, ngram_range=(1, 3))
    trainX = vectorizer.fit_transform(trainX)
    testX = vectorizer.transform(testX)
    
    _logger.info("Dumping binaries...")
    pickle.dump(vectorizer,open("vectorizer.bin",'w'))
    pickle.dump(trainX,open("train.vectorized.mat",'w'))
    pickle.dump(testX,open("test.vectorized.mat",'w'))
    
    schema = vectorizer.get_feature_names()
    codecs.open("schema.dat",'w',encoding='utf-8').write('\n'.join(schema))

    # debug
#    _logger.info("Dumping inversered...")
#    codecs.open("test.vectorized.dat",'w',encoding='utf-8').write( '\n'.join( [(' '.join(i)) for i in vectorizer.inverse_transform(testX)] ) )
#    codecs.open("train.vectorized.dat",'w',encoding='utf-8').write( '\n'.join( [(' '.join(i)) for i in vectorizer.inverse_transform(trainX)] ) )

    trainX = trainX.tocoo(False)
    testX = testX.tocoo(False)
    
    _logger.info("Dumping test.vectorized.dat...")
    with codecs.open("test.vectorized.dat",'w',encoding='utf-8') as fl:
        dc = defaultdict(list)
        for r,c,v in zip(testX.row,testX.col,testX.data):
            dc[r].append( "%s(%s)=%s"%(schema[c],c,v) )
        for i in sorted(dc.keys()):
            fl.write("%s\t%s\n" % (i, " , ".join(list(dc[i])) ))
    
    
    _logger.info("Dumping train.vectorized.dat...")
    with codecs.open("train.vectorized.dat",'w',encoding='utf-8') as fl:
        dc = defaultdict(list)
        for r,c,v in zip(trainX.row,trainX.col,trainX.data):
            dc[r].append( "%s(%s)=%s"%(schema[c],c,v) )
        for i in sorted(dc.keys()):
            fl.write("%s\t%s\n" % (i, " , ".join(list(dc[i])) ))
Example 21
    def fetch_source(self, source):
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
        _logger.info("crawling source id=%d url=%s" %
                     (source['id'], source['url']))

        cur_time = int(time.time())
        last_crawl_time = source['last_crawl_time']
        if cur_time - last_crawl_time < HOUR:
            _logger.info("ignore source(%s), last crawled %d minutes ago" %
                         (source['url'], (cur_time - last_crawl_time) / 60))
            return

        try:
            _logger.debug("fetching feed from (%s)" % source['url'])
            p = feedparser.parse(source['url'])
            _logger.debug("fetched from (%s)" % source['url'])
            if p.feed.has_key(
                    'updated_parsed') and p.feed.updated_parsed != None:
                cur_feed_time = int(time.mktime(p.feed.updated_parsed))
            else:
                # FeedParser doesn't understand the 'updated' field of this
                # feed, and neither can we. Probably some CJK chars.
                cur_feed_time = int(time.time())
            db_feed_time = source['last_feed_time']
            if db_feed_time >= cur_feed_time:
                _logger.info(
                    "ignore source(%s), no new feed. Last feed:%s, cur feed:%s"
                    % (source['url'], datetime.fromtimestamp(db_feed_time),
                       datetime.fromtimestamp(cur_feed_time)))
                self.agent.update_source_time(source)
            else:
                _logger.info("processing %d entries from %s" %
                             (len(p.entries), source['url']))
                for entry in p.entries:
                    self.process_entry(entry, source)
                self.agent.update_source_time(source, cur_feed_time)
                _logger.debug(
                    "source(%s) updated: %s" %
                    (source['url'], datetime.fromtimestamp(cur_feed_time)))

            _logger.info("source(id=%d) success" % source['id'])
            _logger.debug("pool stat: %d working %d waiting" %
                          (self.pool.running(), self.pool.waiting()))
        except Exception, err:
            _logger.error(
                "crawling faild for source id=%d, %s: %s" %
                (source['id'], source['url'], traceback.format_exc()))
Example 22
    def user_timeline(self, count=10):
        # Assume logged in
        self.selenium.click('id=mblog')
        self._wait_load()
        soup = BeautifulSoup(self.selenium.get_html_source())
        tweet = [i.text for i in soup.findAll('p', 'sms')]

        while len(tweet) < count:
            try:
                self.selenium.click(u'下一页')
                self._wait_load()
            except Exception, err:
                _logger.info('failed to load next page: %s', err)
                break
            soup = BeautifulSoup(self.selenium.get_html_source())
            tweet.extend([i.text for i in soup.findAll('p', 'sms')])
Example 23
class SQLAgent(object):
    __metaclass__ = MetaAgent
    # Set sscursor to True to keep the result set on the server; useful for large result sets.
    def __init__(self, db_name, db_user, db_pass, host = "localhost", sscursor = False):
        self.db_name = db_name
        self.db_user = db_user
        self.db_pass = db_pass
        self.db_host = host
        self.use_sscursor = sscursor

    def start(self):
        _logger.info('connecting DB... host:%s %s@%s:%s' % (self.db_host, self.db_user, self.db_name, self.db_pass))
        self.conn = MySQLdb.connect(host = self.db_host,
                                    user = self.db_user,
                                    passwd = self.db_pass,
                                    db = self.db_name,
                                    )
        if self.use_sscursor: # store result in server
            self.cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
        else:
            self.cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)

        self.cursor.old_execute = self.cursor.execute
        self.cursor.execute = self.safe_execute

        self.cursor.execute('set names utf8')
        self.conn.commit()

    def stop(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception, err:
            _logger.error('stopping SQLAgent failed: %s, will continue anyway' % err)
        _logger.info('sql agent stopped')
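start() above swaps cursor.execute for self.safe_execute, which is not part of this excerpt. Below is a minimal sketch of what such a wrapper could do (reconnect once and retry on a lost connection), written as a method of the SQLAgent class above; only the method name comes from the excerpt, the body is an assumption:

    def safe_execute(self, sql, args=None):
        # Hypothetical sketch: the real safe_execute is not shown in this excerpt.
        try:
            return self.cursor.old_execute(sql, args)
        except MySQLdb.OperationalError, err:
            _logger.error('execute failed (%s), reconnecting and retrying' % err)
            self.start()  # re-opens the connection and re-wraps the new cursor
            return self.cursor.old_execute(sql, args)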
Example 24
    def start(self):
        _logger.info('connecting DB... host:%s %s@%s:%s' %
                     (self.db_host, self.db_user, self.db_name, self.db_pass))
        self.conn = MySQLdb.connect(
            host=self.db_host,
            user=self.db_user,
            passwd=self.db_pass,
            db=self.db_name,
        )
        if self.use_sscursor:  # store result in server
            self.cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
        else:
            self.cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)

        self.cursor.execute('set names utf8')
        self.conn.commit()
Example 25
    def crawl(self, url):
        self.owned = set()
        self.output = open('265.crawl%s' % datetime.now().date(), 'w')
        _logger.info('opening 265 home page: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"),
                             fromEncoding='utf-8')

        for anchor in soup.find('div',
                                id="siteCate").find('div',
                                                    'body').findAll('a'):
            _logger.info('crawling top tier category: %s (%s)' %
                         (anchor.text, anchor['href']))
            self.output.write('%s\n' % anchor.text.encode('utf8'))
            self.crawl_layer(pbrowser.abs_url(url, anchor['href']), 1)
        self.output.close()
Example 26
    def _crawl_primary(self, anchor):
        self.output.write(anchor.text.encode('utf-8') + '\n')
        self._randsleep()
        html = self.br.open(anchor['href']).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)

        seconds = soup.findAll('div', 'dirtit')
        for second in seconds:
            _logger.info('crawling secondary category: (%s)' %
                         second.text.encode('utf-8'))
            try:
                self._crawl_secondary(second)
            except Exception, err:
                _logger.error('secondary(%s) failed: %s' %
                              (second.text.encode('utf-8'), err))
Example 27
def interpret(inpath, outpath):
    with open(inpath, "rb") as crawled_docs:
        docs = cPickle.load(crawled_docs)
    _logger.info('found ' + str(len(docs)) + ' docs from crawler\'s output')

    output_str = u''
    for doc in docs:
        _logger.info('processing doc from url: ' + doc['url'])
        contents = parse_html(doc)
        output_str += unicode(doc['url'] + '\n\n' + '+' * 100 + '\n\n')
        for paragraph in contents:
            if is_valid_text(paragraph):
                output_str += unicode(paragraph + '\n\n' + '+' * 100 + '\n\n')
        output_str += ('\n\n' + '=' * 100 + '\n')
    with open(outpath, "w") as output:
        output.write(output_str.encode('utf-8'))
Example 28
    def crawl(self, url):
        self.output = open('baike.crawl%s' % datetime.now().date(), 'w')
        _logger.info('opening baike home page: %s' % url)
        html = self.br.open(url).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)

        for item in soup.find('div', id="classList").findAll('h2'):
            anchor = item.find('a')
            _logger.info(
                'crawling primary category: (%s), %s' %
                (anchor.text.encode('utf-8'), anchor['href'].encode('utf-8')))
            try:
                self._crawl_primary(anchor)
            except Exception, err:
                _logger.error('primary category(%s) failed: %s' %
                              (anchor.text.encode('utf-8'), err))
Example 29
def try_crawl_href(anchor_url, anchor_text, encoding, agent, selenium):
    _logger.debug('crawling anchor (%s), URL: %s' % (anchor_text, anchor_url))
    # filters
    # ignore bad-looking anchors
    if util.chinese_charactor_count(anchor_text.decode('utf-8')) < 10:
        _logger.debug('too few chinese chars in anchor text, ignoring')
        return None

    # ignore same href crawled recently
    if crawled_as_terminal(agent, anchor_url, anchor_text, 30):
        _logger.debug('ignore %s, same href was crawled within the last 30 days'
                      % anchor_url)
        return None

    tweet = crawl_href(anchor_url, anchor_text, encoding, selenium)
    _logger.info('crawl_href finished, anchor-text:(%s)' % anchor_text)
    return tweet
Example 30
    def crawl_one_author(self, url, callback):
        page = 1
        while True:
            _logger.info("openning page URL: %s" % url)
            self.br.open(url, timeout=TIMEOUT)
            soup = BeautifulSoup(self.br.response().read())
            url = self.br.geturl()

            img_div = soup.findAll('div', 'images')
            imgs = list(
                itertools.chain(
                    *[div.findAll('a', target='_blank') for div in img_div]))
            imgs.extend(soup.findAll('a', {'data-location': 'content'}))
            _logger.debug("%d images on this page" % len(imgs))

            for a in imgs:
                img_url = a['href']

                if img_url in self.crawl_history:
                    _logger.debug('ignoring crawled URL: %s' % img_url)
                    continue

                info = None
                try:
                    all_info = self.grab_image_info(img_url)
                    self.logfile.write(img_url + '\n')
                    self.logfile.flush()
                    _logger.debug('image processed %s' % img_url)
                except Exception, err:
                    _logger.error(
                        'processing one image url failed, url:%s, %s' %
                        (img_url, err))
                else:
                    for info in all_info:
                        try:
                            if callback != None:
                                callback(info=info)
                        except Exception, err:
                            _logger.error(
                                'callback failed, image url: %s, %s, %s' %
                                (img_url, err, traceback.format_exc()))

                _logger.debug('sleeping for 5 sec')
                time.sleep(5)
Example 31
def test(X, y):
    by_domain = defaultdict(list)
    sz = len(y)
    for i in xrange(sz):
        by_domain[y[i]].append(X[i])

    domains = ['alarm', 'calendar', 'communication', 'note', 'places',
               'reminder', 'weather', 'web']
    for p in domains:
        for q in domains:
            if p < q:
                clf = svms[p, q]
                p_len = len(by_domain[p])
                q_len = len(by_domain[q])
                X = list(by_domain[p])
                X.extend(by_domain[q])
                y = [p] * p_len
                y.extend([q] * q_len)
                _logger.info("%.4f, %s - %s" % (clf.score(X, y), p, q))
Example 32
def main():
    listen()
    socket.setdefaulttimeout(120)
    agent = WeeSQLAgent('weDaily', 'junyi', 'admin123')
    agent.start()
    pool = eventlet.GreenPool(2000)
    crawler = FeedCrawler(agent, pool)
    loop_count = 1
    while True:
        agent.restart()
        sources = agent.get_all_sources()
        for source in sources:
            pool.spawn_n(crawler.fetch_source, source)
        pool.waitall()

        _logger.info("loop %d finished, will sleep for %d seconds" %
                     (loop_count, SLEEP_IN_SEC))
        loop_count += 1
        time.sleep(SLEEP_IN_SEC)
Example 33
    def cv(self, fold):
        size = len(self.y)
        kf = cross_validation.KFold(size, fold, shuffle=True)
        iteration = 0
        scores = list()
        for train_idx, test_idx in kf:
            X = [self.X[idx] for idx in train_idx]
            y = [self.y[idx] for idx in train_idx]
            X_test = [self.X[idx] for idx in test_idx]
            y_test = [self.y[idx] for idx in test_idx]
            _logger.debug("Training...")
            self.fit(X, y)
            _logger.debug("Testing...")
            score = self.get_test_accuracy(X_test, y_test)
            scores.append(score)
            iteration += 1
            _logger.info("CV iteration %d: CV accuracy: %f" % \
                             (iteration, score))

        scores = np.array(scores)
        return scores.mean(), scores.std()
Example 34
    def train(self):
        _logger.info("reading posterior probabilities from naive bayes model")
        self.words = list()
        self.words_seen = set()
        X = np.array([])
        for term in g_term_count:
            term = term_category(term)
            if term in self.words_seen:
                continue
            self.words_seen.add(term)
            self.words.append(term)
            x = list()
            for domain in self.naive.model.domains:
                val = self.naive.posterior_prob(term, domain)
                x.append(val)
            X = np.append(X, x)
        _logger.info("%d terms need to be clustered" % len(self.words))

        X = np.reshape(X, (len(self.words), len(self.naive.model.domains)))
        kmeans = KMeans(n_clusters = len(self.words) / 10)
        y = kmeans.fit_predict(X)

        with open(OUTFILE_PATH, "w") as outfile:
            for i in xrange(len(y)):
                outfile.write("%s\t%d\n" % (self.words[i].encode('utf-8'), y[i]))
        _logger.info("clustering result wrote to %s" % OUTFILE_PATH)
Example 35
def fill_all_accounts(daemon, helper):
    users = daemon.agent.get_all_user()
    good_ids = open('good', 'w')
    bad_ids = open('bad', 'w')
    for user in users:
        try:
            api = daemon.get_api_by_user(user.uname)
            good_ids.write(
                '%s:%s:%s\n' %
                (user.uname, user.passwd, api.me().name.encode('utf-8')))
            _logger.info('successfully fetched api for %s' % user.uname)
        except Exception, err:
            _logger.error('failed fetching api for %s: %s' %
                          (user.uname, err))
            bad_ids.write('%s:%s\n' % (user.uname, user.passwd))
            # try:
            #     fill_account(daemon, helper, user)
            # except Exception, err:
            #     _logger.error('fill_account failed for (%s): %s' %
            #                   (user.uname, err))
        time.sleep(1)
Example 36
    def get_all_friend(self, callback=None):
        profile_page = self.selenium.get_location()
        _logger.debug('copy location url: %s' % profile_page)
        _logger.debug('loading attentions page')
        self.selenium.click('id=attentions')
        self._wait_load()

        soup = BeautifulSoup(self.selenium.get_html_source())
        friends = [
            self._create_user_from_attention_list(i)
            for i in soup.findAll('li', 'MIB_linedot_l')
        ]
        while True:
            try:
                self.selenium.click(u'下一页')
                self._wait_load()
            except Exception, err:
                _logger.info('failed to load next page: %s' % err)
                break
            soup = BeautifulSoup(self.selenium.get_html_source())
            for li in soup.findAll('li', 'MIB_linedot_l'):
                friends.append(self._create_user_from_attention_list(li))
                if callback != None:
                    callback(li)
Example 37
def clean(X, y, k=10):
    _logger.info("cleaning base on %d-fold cross validation" % k)

    size = len(y)
    kf = KFold(size, n_folds=k, shuffle=True)
    fold = 1
    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        pipeline = Pipeline([
                ("vert", TfidfVectorizer(min_df = 1, binary = True, ngram_range = (1, 3),
                                         tokenizer = Tokenizer())),
                ("clf", LinearSVC(loss='l1',
                                  penalty="l2",
                                  multi_class="ovr",
                                  class_weight="auto")),
                ])
        _logger.debug("Training fold %d" % fold)
        pipeline.fit(X_train, y_train)
        _logger.debug("Predicting for fold %d" % fold)
        y_pred = pipeline.predict(X_test)
        _logger.info("fold %d got accuracy: %f" % (fold, accuracy_score(y_test, y_pred)))

        right_f = open("fold%d.right.dat" % fold, "w")
        wrong_f = open("fold%d.wrong.dat" % fold, "w")

        size = len(y_test)
        for i in xrange(size):
            sent, pred, gold = X_test[i].encode('utf-8'), y_pred[i].encode('utf-8'), y_test[i].encode('utf-8')
            if pred != gold:
                wrong_f.write("%s\t%s\t%s\n" % (pred, gold, sent))
            else:
                right_f.write("%s\t%s\n" % (sent, gold))

        right_f.close()
        wrong_f.close()

        fold += 1
Example 38
def test(test_file_path, clf):
    X, y = load_data(test_file_path)
    size = len(y)

    scores = clf.decision_function(X)
    # y_pred = []
    # for i in xrange(size):
    #     score = scores[i]
    #     detail = sorted(zip(clf.named_steps['clf'].classes_,
    #                         score),
    #                     key = lambda x: -x[1])
    #     if detail[0][1] >= 1.1:
    #         y_pred.append(detail[0][0])
    #     else:
    #         y_pred.append(u'web')

    y_pred = clf.predict(X)
    outfile = open("predicted.dat", 'w')
    for i in range(len(y)):
        sentence, pred, gold = X[i], y_pred[i], y[i]
        outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'), pred.encode('utf-8'), gold.encode('utf-8')))
    _logger.info("accuracy: %f, %d records" % (accuracy_score(y, y_pred),
                                               len(y)))
Example 39
    def crawl_layer(self, url, level):
        self._randsleep()

        prefix = '  ' * level
        _logger.info('opening layer url: %s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"),
                             fromEncoding="utf-8")
        _logger.info('processing page with title:%s' % soup.title.text)

        # get next level links
        children = {}
        for li in soup.find('div', id='TreeData').findAll('li', 'close'):
            a = li.find('a')
            children[a.text] = a['href']

        # grab links in current page
        for div in soup.find('div', id="BMain").findAll('div', 'subBM'):
            cate = div.find('h3').text
            if cate in self.owned:
                continue

            self.owned.add(cate)
            self.output.write(prefix + '%s\n' % cate.encode('utf8'))
            for li in div.find('ul', 'listUrl').findAll('li'):
                try:
                    a = li.find('a')
                    self.output.write(
                        prefix * 2 + '%s %s\n' %
                        (a['href'].encode('utf8'), a.text.encode('utf8')))
                except Exception, err:
                    _logger.error('error processing anchor(%s): %s' %
                                  (str(li), err))

            # grab links in next level, if any
            if cate in children:
                self.crawl_layer(children[cate], level + 1)
Example 40
def test(X, y):
    _logger.info("Fisrt stage accuracy: %f" % front.score(X, y))
    import decode_svm
    outfile = open("predicted.dat", "w")
    discfile = open("discriminated.dat", "w")
    y_pred = list()
    sz = len(y)
    domains = front.named_steps["clf"].classes_
    for i in xrange(sz):
        sent = X[i]
        gold = y[i]
        
        front_result = sorted(zip(domains, front.decision_function([sent])[0]),
                              key = lambda x: -x[1])
        
        pred = front_result[0][0]
        assert pred == front.predict([sent])[0]

        if front_result[0][1]  < 0.0 or front_result[1][1] > 0.0:
            p = front_result[0][0]
            q = front_result[1][0]
            svm_pred = decode_svm.discriminate(p, q, sent)[0]
            discfile.write("%s\t%s\t%s\t%s\t%s\n" % \
                               (sent.encode('utf-8'),
                                p.encode('utf-8'),
                                q.encode('utf-8'),
                                svm_pred.encode('utf-8'), gold.encode('utf-8')))
            pred = svm_pred

        y_pred.append(pred)

        outfile.write("%s\t%s\t%s\n" % (sent.encode('utf-8'), pred.encode('utf-8'), gold.encode('utf-8')))

    _logger.info("ensembled accuracy: %f" % accuracy_score(y, y_pred))

    outfile.close()
    discfile.close()
Example 41
    def train_pair(self, p, q):
        if p > q:
            p, q = q, p

        p_len = len(self.by_domain_data[p])
        q_len = len(self.by_domain_data[q])

        _logger.info("Training SVM for %s V.S. %s, %d + %d = %d recored" % \
                         (p, q, p_len, q_len, p_len + q_len))

        X = list(self.by_domain_data[p])
        X.extend(self.by_domain_data[q])
        y = [p] * p_len
        y.extend([q] * q_len)

        pipeline = Pipeline([
                ("vert", TfidfVectorizer(min_df = 1, binary = False, ngram_range = (1, 1),
                                         tokenizer = Tokenizer())),
                ("svm", LinearSVC(loss='l2', penalty="l1",
                                  dual=False, tol=1e-3)),
                ])

        if self.cv > 0:
            _logger.info("Doing grid search on %d fold CV" % self.cv)
            params = {
                "svm__C": [1, 10, 50, 100, 500, 1000],
                }
            grid = GridSearchCV(pipeline, params, cv=self.cv, verbose=50)
            grid.fit(X, y)
            pipeline = grid.best_estimator_
            _logger.info("Grid search got best score:%f" % grid.best_score_)
            pipeline.accur = grid.best_score_
        else:
            pipeline.fit(X, y)
            _logger.debug("Testing on training data")
            accur = accuracy_score(y, pipeline.predict(X))
            pipeline.accur = accur
            _logger.info("Trainig accuracy (%s - %s): %f" % (p, q, accur))
        self.svms[p,q] = pipeline
        return pipeline
Example 42
        ])

params = {
    "nb__alpha": [0.001, 0.01, 0.1, 0.5],
    }

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data", default=TRAIN_FILE_PATH)
    cmd.add_argument("--cv", help="enable cross validation", type=int, default=0)
    args = cmd.parse_args()

    X, y = load_data(args.input)

    if args.cv > 0:
        _logger.info("Doing %d fold cross validation" % args.cv)
        gs = GridSearchCV(pipeline, params, cv = args.cv, verbose=5)
        gs.fit(X, y)

        with open("sk_naive.model", "w") as outfile:
            pickle.dump(gs.best_estimator_, outfile)
            _logger.info("Model dumped to sk_naive.model")        
        print gs.best_estimator_
        print gs.best_score_
    else:
        _logger.info("Start training")
        pipeline.fit(X, y)
        with open("sk_naive.model", "w") as outfile:
            pickle.dump(pipeline, outfile)
            _logger.info("Model dumped to sk_naive.model")
Example 43
 def __init__(self, naive_model_path):
     _logger.info("loading naive bayes model from %s" % naive_model_path)
     model = pickle.load(open(naive_model_path))
     self.naive = NaiveDecoder(model)
     self.words = dict()
Example 44
    domains = ['alarm', 'calendar', 'communication', 'note', 'places',
               'reminder', 'weather', 'web']
    for p in domains:
        for q in domains:
            if p < q:
                clf = svms[p, q]
                p_len = len(by_domain[p])
                q_len = len(by_domain[q])
                X = list(by_domain[p])
                X.extend(by_domain[q])
                y = [p] * p_len
                y.extend([q] * q_len)
                _logger.info("%.4f, %s - %s" % (clf.score(X, y), p, q))


_logger.info("loading model from svms.model")
svms = pickle.load(open('svms.model'))
        
if __name__ == "__main__":

    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help = "path to the test data", default=TEST_FILE_PATH)
    cmd.add_argument("--serv", help = "run as server", dest="as_server", action='store_true')
    args = cmd.parse_args()
    X, y = load_data(args.path)

    if args.as_server:
        serv()
    else:
        test(X, y)
Example 45
    #     else:
    #         y_pred.append(u'web')

    y_pred = clf.predict(X)
    outfile = open("predicted.dat", 'w')
    for i in range(len(y)):
        sentence, pred, gold = X[i], y_pred[i], y[i]
        outfile.write("%s\t%s\t%s\n" % (sentence.encode('utf-8'), pred.encode('utf-8'), gold.encode('utf-8')))
    _logger.info("accuracy: %f, %d records" % (accuracy_score(y, y_pred),
                                               len(y)))


if __name__ == "__main__":

    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help = "path to the test data", default=TEST_FILE_PATH)
    cmd.add_argument("--serv", help = "run as server", dest="as_server", action='store_true')
    cmd.add_argument("--model", help = "path to the pickled model", required=True,
                     choices = ["%s.model" % algo for algo in CLFs.keys()])
    args = cmd.parse_args()

    _logger.info("loading model from %s" % args.model)
    clf = pickle.load(open(args.model))

    if args.as_server:
        serv(clf)
        sys.exit(0)

    test(args.path, clf)

Example 46
                self.terms.add(term)
                self.domains.add(domain)

        v = len(self.terms)
        for term in self.terms:
            p = dict()
            for domain in self.domains:
                p[domain] = (1.0 + self.count[term, domain]) / (v + self.count[domain])
            wcp = dict()
            s = sum(p.values())
            for domain in self.domains:
                wcp[domain] = p[domain] / s
            # use a distinct loop variable so the vocabulary size 'v' is not clobbered
            self.gini[term] = sum([prob ** 2 for prob in wcp.values()])

    def dump(self, out_path):
        with open(out_path, 'w') as outfile:
            for k, v in self.gini.items():
                outfile.write("%s %f\n" % (k, v))

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the count data")
    cmd.add_argument("--output", help="path to dump the model", default=MODEL_PATH)
    args = cmd.parse_args()

    gini = GiniCoe(args.input)
    _logger.info("Training Gini coefficient from count file: %s" % args.input)
    gini.train()
    _logger.info("Dumping model to %s" % args.output)
    gini.dump(args.output)
Example 47
from util import *
from util.log import _logger
from model.naive.train import NaiveBayes
from feat.terms.term_categorize import term_category, g_term_count
import rep.word_clustering.decode as word_clustering

class ClusteredNaiveBayes(NaiveBayes):
    def get_category(self, term):
        term = term_category(term)
        return word_clustering.get_cluster(term)


if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data")
    cmd.add_argument("--terms", help="path of the terms file")
    cmd.add_argument("--alpha", help="alpha of discounting", type=float, default=0.5)
    cmd.add_argument("--cv", help="enable cross validation", type=int, default=0)

    args = cmd.parse_args()

    naive = ClusteredNaiveBayes(args.input, args.terms, args.alpha)
    if args.cv > 0:
        _logger.info("CV accuracy: %f +/- %f" % naive.cv(args.cv))
    else:
        _logger.info("Start training");
        naive.train()
        with open("naive.clustered.model", "w") as outfile:
            pickle.dump(naive, outfile)
            _logger.info("Model dumped to naive.clustered.model")
Example 48
    vert = clf.named_steps['vert']
    terms = list(set(sentence.split()))
    terms = sorted([(term, sel.scores_[get_vert_idx(vert, term_category(term))]) for term in terms], 
                   key = lambda x: -x[1])[:7]
    return ' '.join([term[0] for term in terms])


def extract(X, clf):
    ret = []
    for sentence in X:
        ret.append(slim(sentence, clf))
    return ret
            

if __name__ == "__main__":
    _logger.info("loading model")
    clf = pickle.load(open('sk_naive.model'))
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help = "path to the test data", default=TEST_FILE_PATH)
    cmd.add_argument("--serv", help = "run as server", dest="as_server", action='store_true')
    args = cmd.parse_args()

    if args.as_server:
        serv(clf)

    X, y = load_data(args.path)

    # _logger.debug("Extracting merites for long sentences")
    # X = extract(X, clf)
    
    y_pred = clf.predict(X)
Example 49
        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")

        return ret


if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data", default = TRAIN_FILE_PATH)
    cmd.add_argument("--algo", help="alogrithm to use", required=True, choices = CLFs.keys())
    args = cmd.parse_args()

    X, y = load_data(args.input)
    _logger.info("training using %s" % args.algo)

    pipeline = Pipeline([
            ("vert", TfidfVectorizer(min_df = 1, binary = True, ngram_range = (1, 3),
                                     tokenizer = Tokenizer())),
            #("vert", Vectorizer()),
            ("clf", CLFs[args.algo]),
            ])

    pipeline.fit(X, y)
    from decode import test
    test(TEST_FILE_PATH, pipeline)

    outpath = "%s.model" % args.algo
    with open(outpath, "w") as outfile:
        pickle.dump(pipeline, outfile)
Example 50
if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--serv", help = "run as server", dest="as_server", action='store_true')
    cmd.add_argument("--serv-prob", help = "run as a server that compares posterior probabilities of terms under every domain", dest="as_server_prob", action='store_true')
    cmd.add_argument("--path", help = "path to the test data", default=TEST_FILE_PATH)
    cmd.add_argument("--model-path", help = "path to the naive bayes model file")
    args = cmd.parse_args()
    print args

    _logger.info("Loading model")
    model = pickle.load(open(args.model_path))

    if args.as_server:
        serv(model)
    elif args.as_server_prob:
        serv_prob(model)
    else:
        test(model, args.path)
Example 51
from train import Vectorizer


def gen(path, clf):
    X, y = load_data(path)
    scores = clf.decision_function(X)
    sz = len(y)
    with open("web_split.dat", "w") as outfile:
        for i in xrange(sz):
            assert y[i] == "web"
            score = scores[i]
            detail = sorted(zip(clf.named_steps["clf"].classes_, score), key=lambda x: -x[1])
            outfile.write("%s %f\n" % (detail[0][0], detail[0][1]))


if __name__ == "__main__":

    cmd = argparse.ArgumentParser()
    cmd.add_argument("--path", help="path to only-web training data")
    cmd.add_argument("--serv", help="run as server", dest="as_server", action="store_true")
    cmd.add_argument("--gen", help="generate training data", dest="generate", action="store_true")

    args = cmd.parse_args()

    _logger.info("loading model from %s" % "svm_ovr.model")
    clf = pickle.load(open("svm_ovr.model"))

    if args.generate:
        gen(args.path, clf)
Example 52
            accur = accuracy_score(y, pipeline.predict(X))
            pipeline.accur = accur
            _logger.info("Trainig accuracy (%s - %s): %f" % (p, q, accur))
        self.svms[p,q] = pipeline
        return pipeline
                    

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data", default=TRAIN_FILE_PATH)
    cmd.add_argument("--classes", help="the pair of classes need to train, train all combination if not specified",
                     nargs=2, default=None)
    cmd.add_argument("--cv", help="fold of cross validation 0 for not doing", default=0, type=int)
    args = cmd.parse_args()

    _logger.info("Loading training data from %s" % args.input)
    X, y = load_data(args.input)

    if args.classes:
        _logger.info("Will train 1v1 SVM between %s and %s" % (args.classes[0], args.classes[1]))
        gp = SVMGroup(cv=args.cv)
        if os.path.isfile("svms.model"):
            gp.svms = pickle.load(open("svms.model"))

        gp.collect_by_domain(X, y)
        gp.train_pair(args.classes[0], args.classes[1])

    else:
        gp = SVMGroup()
        _logger.info("Start training")
        gp.train(X, y)
Example 53
                terms, domain = line.split('\t')
                term_set = set()
                for term in terms.split(' '):
                    term = term_category(term)
                    if term not in term_set:
                        term_set.add(term)
                        self.count[(term, domain)] += 1
                c += 1
                if c % 10000 == 0:
                    _logger.debug("%d records processed" % c)

    def dump(self, path):
        with open(path, 'w') as outfile:
            for key, val in self.count.items():
                term, domain = key
                outfile.write("%s %s %d\n" % (term.encode('utf-8'), domain.encode('utf-8'), val))
                


if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--input", help="path of the training data")
    cmd.add_argument("--output", help="path to dump the model", default=DEFAULT_OUTPATH)
    args = cmd.parse_args()

    counter = Counter(args.input)
    _logger.info("training from %s" % args.input)
    counter.train()
    _logger.info("dumping model to %s" % args.output)
    counter.dump(args.output)
Example 54
            q = front_result[1][0]
            svm_pred = decode_svm.discriminate(p, q, sent)[0]
            discfile.write("%s\t%s\t%s\t%s\t%s\n" % \
                               (sent.encode('utf-8'),
                                p.encode('utf-8'),
                                q.encode('utf-8'),
                                svm_pred.encode('utf-8'), gold.encode('utf-8')))
            pred = svm_pred

        y_pred.append(pred)

        outfile.write("%s\t%s\t%s\n" % (sent.encode('utf-8'), pred.encode('utf-8'), gold.encode('utf-8')))

    _logger.info("ensembled accuracy: %f" % accuracy_score(y, y_pred))

    outfile.close()
    discfile.close()

if __name__ == "__main__":
    cmd = argparse.ArgumentParser()
    cmd.add_argument("--serv", help = "run as server", dest="as_server", action='store_true')
    cmd.add_argument("--path", help = "path to the test data", default=TEST_FILE_PATH)
    cmd.add_argument("--front-model-path", help = "path to the first stage model")
    args = cmd.parse_args()

    _logger.info("Loading naive bayes model from %s" % args.front_model_path)
    front = pickle.load(open(args.front_model_path))
    X, y = load_data(args.path)
    test(X, y)