Code example #1
File: modeling.py Project: Divergent914/kddcup2015
def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)
    print(auc_score(clf, X_new, y))
    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('lr', clf)]), 'lr_with_fs_0620_02')
Code example #2
File: modeling.py Project: Sandy4321/kdd2015-2
def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)
    print(auc_score(clf, X_new, y))
    to_submission(
        Pipeline([('scale_raw', raw_scaler), ('rfe', rfe),
                  ('scale_new', new_scaler), ('lr', clf)]),
        'lr_with_fs_0620_02')
Code example #3
def send_sms(phone, text):
	options = {
		'api_id': config.SMS_ID,
		'to': phone,
		'text': text.encode('utf-8'),
	}
	if hasattr(config, 'SMS_FROM'):
		options['from'] = config.SMS_FROM
	util.fetch('http://sms.ru/sms/send', options)
	logging.info('Sent an SMS to %s' % phone)
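A minimal usage sketch for the helper above; the phone number and message text are hypothetical placeholders, and config.SMS_ID / config.SMS_FROM are assumed to be set in the project's config module:

# Hypothetical example values; send_sms() posts them to sms.ru via util.fetch.
send_sms('+79991234567', u'Your confirmation code is 1234')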
Code example #4
File: queue.py Project: Alwnikrotikz/freemusic
def submit(id, url):
	"""
	Sends the server a link to the finished album.xml. The link must be
	reachable from outside, because the server will request it.
	"""
	print "Submitting item %u (%s)" % (id, url)
	fetch('http://' + settings['host'] + '/upload/queue', {
		'id': id,
		'url': url,
		'signature': sign(url),
	})
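A minimal usage sketch for submit(); the id and URL below are hypothetical placeholders for an externally reachable album.xml:

# Hypothetical call: the server at settings['host'] will fetch this URL itself.
submit(42, 'http://files.example.com/albums/42/album.xml')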
Code example #5
File: modeling.py Project: Sandy4321/kdd2015-2
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)
    params = {
        'loss': [
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ]
    }
    grid = GridSearchCV(sgd,
                        param_grid=params,
                        cv=StratifiedKFold(y, 5),
                        scoring='roc_auc',
                        n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)

    sgd = grid.best_estimator_

    logger.debug('E_in: %f', auc_score(sgd, X_new, y))
    to_submission(
        Pipeline([('scale_raw', raw_scaler), ('rfe', rfe),
                  ('scale_new', new_scaler), ('sgd', sgd)]), 'sgd_0620_03')
Code example #6
File: modeling.py Project: Divergent914/kddcup2015
def lr():
    """
    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV
    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))
    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X, y)
    print(auc_score(clf, X, y))
    to_submission(clf, 'lr_0618_xxx')
Code example #7
File: modeling.py Project: Sandy4321/kdd2015-2
def lr():
    """
    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV
    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))
    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X, y)
    print(auc_score(clf, X, y))
    to_submission(clf, 'lr_0618_xxx')
Code example #8
def load_test():
    """
    Load dataset for testing.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features.
    """
    pkl_path = util.cache_path('test_X')
    if os.path.exists(pkl_path):
        X = util.fetch(pkl_path)
    else:
        enroll_set = np.sort(util.load_enrollment_test()['enrollment_id'])
        # log = util.load_logs()
        # base_date = log['time'].max().to_datetime()
        base_date = datetime(2014, 8, 1, 22, 0, 47)
        X = None
        for f in MODELING['features']:
            X_ = f(enroll_set, base_date)
            if X is None:
                X = X_
            else:
                X = np.c_[X, X_]
        util.dump(X, pkl_path)
    return X
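A short usage sketch of the caching pattern above (assuming the MODELING['features'] extractors and the util cache helpers are configured as in this project):

# First call computes the feature matrix and caches it under
# util.cache_path('test_X'); later calls just load the pickle.
X_test = load_test()
print(X_test.shape)  # (num_of_enrollments, num_of_features)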
Code example #9
def main():
    global update_id, misc, plugins, database
    url = "https://api.telegram.org/bot{}/getUpdates?offset={}".format(
        misc['token'], update_id)
    response = util.fetch(url, misc['session'])
    executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=config.workers)  # See docs
    executor.submit(check_time_args)
    try:
        response = response.json()
    except (AttributeError, ValueError) as e:
        print("Error parsing Telegram response: {}\nResponse: {}".format(
            e, response))
        time.sleep(config.sleep)
        return
    if response['ok'] and response[
            'result']:  # Response ok and contains results
        update_id = response['result'][-1]['update_id'] + 1
        for result in response['result']:  # Loop through result
            executor.submit(run_extension, result)  # pass the callable and its argument separately
            if 'message' in result:  # For message updates
                executor.submit(
                    RouteMessage(result['message'], misc, plugins,
                                 database).route_update)
            elif 'callback_query' in result:  # For callback query updates
                executor.submit(route_callback_query, result['callback_query'],
                                database, plugins, misc)
    elif not response['ok']:
        print('Response not OK\nResponse: {}'.format(response))
    executor.shutdown(
        wait=False
    )  # returns immediately, sub processes will close by themselves
    time.sleep(config.sleep)  # Sleep for time defined in config
Code example #10
File: app.py Project: Laterality/sibylla
def get_similarities():
    global model

    check_model()

    comparison = int(flask.request.args.get("article"))
    ctoken = util.tokenize([util.fetch(comparison)[1]])

    recent_articles = util.fetch_top_100(comparison)
    recent_article_ids = [a[0] for a in recent_articles]

    founds, not_founds = util.fetch_with(comparison, recent_article_ids)
    result_ids = [f[0] for f in founds]

    similarities = [f[1] for f in founds]

    not_found_articles = [a for a in recent_articles if a[0] in not_founds]

    for a in not_found_articles:
        sim = model.docvecs.similarity_unseen_docs(model, ctoken[0], util.tokenize([a[1]])[0])
        util.insert_similarity(comparison, a[0], sim.item())
        result_ids.append(a[0])
        similarities.append(str(sim))

    return flask.jsonify(
        result="ok",
        articleIds=result_ids,
        similarities=similarities
    )
Code example #11
File: dataset.py Project: Divergent914/kddcup2015
def load_test():
    """
    Load dataset for testing.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features.
    """
    pkl_path = util.cache_path('test_X')
    if os.path.exists(pkl_path):
        X = util.fetch(pkl_path)
    else:
        enroll_set = np.sort(util.load_enrollment_test()['enrollment_id'])
        # log = util.load_logs()
        # base_date = log['time'].max().to_datetime()
        base_date = datetime(2014, 8, 1, 22, 0, 47)
        X = None
        for f in MODELING['features']:
            X_ = f(enroll_set, base_date)
            if X is None:
                X = X_
            else:
                X = np.c_[X, X_]
        util.dump(X, pkl_path)
    return X
Code example #12
File: hitagi.py Project: ydnandy/hitagibot
def main():
    global update_id, misc, plugins, database
    url = "https://api.telegram.org/bot{}/getUpdates?offset={}".format(
        misc['token'], update_id)
    response = util.fetch(url, misc['session'])
    try:
        response = response.json()
    except AttributeError:
        print("Error parsing Telegram response\nResponse: {}".format(response))
        time.sleep(5)
        return
    if response['ok'] and response[
            'result']:  # Response ok and contains results
        update_id = response['result'][-1]['update_id'] + 1
        executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=config.workers)  # See docs
        for result in response['result']:  # Loop through result
            if_old = int(time.time()) - int(result['message'][
                'date']) >= 180  # check if message is older than 3 min
            executor.submit(RouteMessage,
                            result['message'],
                            misc,
                            plugins,
                            database,
                            check_db_only=if_old)
        executor.shutdown(
            wait=False
        )  # returns immediately, sub processes will close by themselves
    elif not response['ok']:  # Response not ok
        print('Response not OK\nResponse: {}'.format(response))
    time.sleep(config.sleep)  # Sleep for time defined in config
Code example #13
File: modeling.py Project: Divergent914/kddcup2015
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)
    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge',
                 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive',
                 'squared_epsilon_insensitive']
    }
    grid = GridSearchCV(sgd, param_grid=params, cv=StratifiedKFold(y, 5),
                        scoring='roc_auc', n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)

    sgd = grid.best_estimator_

    logger.debug('E_in: %f', auc_score(sgd, X_new, y))
    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('sgd', sgd)]), 'sgd_0620_03')
Code example #14
File: app.py Project: Laterality/sibylla
def get_similarity():
    global model

    check_model()

    article_1 = flask.request.args.get("article1")
    article_2 = flask.request.args.get("article2")

    article1 = util.fetch(article_1)
    article2 = util.fetch(article_2)
    tokens = util.tokenize([article1[1], article2[1]])
    similarity = str(model.docvecs.similarity_unseen_docs(model, tokens[0], tokens[1]))

    return flask.jsonify(
        result="ok",
        similarity=str(similarity)
    )
Code example #15
File: handler.py Project: rjstreet/bgglookup-glass
 def _handle_timeline_notification(self, data):
   """Handle timeline notification."""
   for user_action in data.get('userActions', []):
     if user_action.get('type') == 'REPLY':
       # Fetch the timeline item.
       item = self.mirror_service.timeline().get(id=data['itemId']).execute()
       attachments = item.get('attachments', [])
       media = None
       if attachments:
         # Get the first attachment on that timeline item and do stuff with it.
         attachment = self.mirror_service.timeline().attachments().get(
             itemId=data['itemId'],
             attachmentId=attachments[0]['id']).execute()
         resp, content = self.mirror_service._http.request(
             attachment['contentUrl'])
         if resp.status == 200:
           media = MediaIoBaseUpload(
               io.BytesIO(content), attachment['contentType'],
               resumable=True)
         else:
           logging.info('Unable to retrieve attachment: %s', resp.status)
       bgg_resp = util.fetch('http://boardgamegeek.com/xmlapi2/search?query=%s' % urllib.quote(item.get('text', '')))
       xmldoc = minidom.parseString(bgg_resp['data'])
       itemlist = xmldoc.getElementsByTagName("item")
       item_id = itemlist[0].attributes['id'].value
       bgg_resp = util.fetch("http://boardgamegeek.com/xmlapi2/thing?id=%s" % item_id)
       xmldoc = minidom.parseString(bgg_resp['data'])
       itemlist = xmldoc.getElementsByTagName("name")
       bgg_name = itemlist[0].attributes['value'].value
       itemlist = xmldoc.getElementsByTagName("thumbnail")
       bgg_thumbnail = itemlist[0].firstChild.data
       itemlist = xmldoc.getElementsByTagName("description")
       bgg_description = itemlist[0].firstChild.data
       logging.info("<article>\n<figure>\n  <img src=\"%s\">\n</figure>\n<section>\n<div style=\"\">\n<p class=\"yellow\">%s</sub></p>\n<p>%s\n</section>\n</article>\n" % (bgg_thumbnail, bgg_name, bgg_description))
       body = {
         'html': "<article>\n<figure>\n  <img src=\"%s\">\n</figure>\n<section>\n<div style=\"text-auto-size\">\n<p class=\"yellow\">%s</sub></p>\n<p>%s\n</section>\n</article>\n" % (bgg_thumbnail, bgg_name, bgg_description),
         'notification': { 'level' : "DEFAULT" }
       }
       self.mirror_service.timeline().insert(
           body=body, media_body=media).execute()
       # Only handle the first successful action.
       break
     else:
       logging.info(
           "I don't know what to do with this notification: %s", user_action)
Code example #16
File: tgapi.py Project: TopBakuhatsu/hitagibot
def get_me(misc):  # getMe
    url = "{}{}getMe".format(misc['base_url'], misc['token'])
    response = util.fetch(url, misc['session']).json()
    if response['ok']:
        return response['result']
    else:
        print("There seems to be an error :(\nCheck your API key and connection to the internet")
        print(response)
        sys.exit()
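A hedged usage sketch for get_me(); the token value is a placeholder, and the session is assumed to be a requests.Session, which is what util.fetch(url, session) appears to receive elsewhere in this bot:

import requests

# Hypothetical bootstrap dict; in the bot, 'misc' is normally built at startup.
misc = {'base_url': 'https://api.telegram.org/bot',
        'token': '123456:ABC-DEF',  # placeholder bot token
        'session': requests.Session()}
print(get_me(misc)['username'])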
Code example #17
File: challenges.py Project: ypcat/vimgolf_rank
def update_challenges():
    """Fetch challenge list from vimgolf and update datastore."""
    logging.info('update_challenges()')

    rows = BeautifulSoup(fetch('/')).findAll('h5')
    count = increment('challenge_tasks', len(rows))
    logging.info('init challenge_tasks = %d' % count)
    for row in rows:
        handle = row.a['href'].split('/')[-1]
        taskqueue.add(url='/challenges/'+handle)
Code example #18
File: modeling.py Project: Sandy4321/kdd2015-2
def dt():
    """
    Submission: dt_0620_05.csv
    E_val: 0.820972
    E_in: 0.835177
    E_out:
    Comment: {'max_depth': 5}
    """
    from sklearn.tree import DecisionTreeClassifier, export_graphviz

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    dt = DecisionTreeClassifier(max_depth=5, class_weight='auto')
    dt.fit(X, y)

    export_graphviz(dt, 'tree.dot')

    logger.debug('E_in: %f', auc_score(dt, X, y))
    to_submission(dt, 'dt_0620_05')
Code example #19
File: tgapi.py Project: yukiisbored/hitagibot
def get_me(misc):  # getMe
    url = "{}{}getMe".format(misc['base_url'], misc['token'])
    response = util.fetch(url, misc['session']).json()
    if response['ok']:
        return response['result']
    else:
        print(
            "There seems to be an error :(\nCheck your API key and connection to the internet"
        )
        print(response)
        sys.exit()
Code example #20
File: modeling.py Project: Divergent914/kddcup2015
def dt():
    """
    Submission: dt_0620_05.csv
    E_val: 0.820972
    E_in: 0.835177
    E_out:
    Comment: {'max_depth': 5}
    """
    from sklearn.tree import DecisionTreeClassifier, export_graphviz

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    dt = DecisionTreeClassifier(max_depth=5, class_weight='auto')
    dt.fit(X, y)

    export_graphviz(dt, 'tree.dot')

    logger.debug('E_in: %f', auc_score(dt, X, y))
    to_submission(dt, 'dt_0620_05')
Code example #21
File: modeling.py Project: Sandy4321/kdd2015-2
def lr_with_scale():
    """
    Submission: lr_with_scale_0620_04.csv
    E_val: <missing>
    E_in: 0.857351105162
    E_out: 0.854097855439904
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_scaled, y)
    print(auc_score(clf, X_scaled, y))
    to_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                  'lr_with_scale_0620_04')
Code example #22
File: modeling.py Project: Divergent914/kddcup2015
def lr_with_scale():
    """
    Submission: lr_with_scale_0620_04.csv
    E_val: <missing>
    E_in: 0.857351105162
    E_out: 0.854097855439904
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_scaled, y)
    print(auc_score(clf, X_scaled, y))
    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('lr', clf)]), 'lr_with_scale_0620_04')
Code example #23
File: index.py Project: wsluyu/monitor-mini
def geturl():
    with open('urlTest.txt', 'r') as f:
        urls = f.readlines()
    for line in urls:
        test_url = line.strip().split(',')
        hostname = urlparse(test_url[0]).netloc
        path = urlparse(test_url[0]).path + '?' + urlparse(test_url[0]).query
        res = util.fetch(hostname, path)

        if not util.assertEqual(res.status, int(test_url[1])):
            # util.reportEmail('http status error:' + status)
            print 'failed', test_url[0]
        else:
            print 'success', test_url[0]
Code example #24
File: challenges.py Project: ypcat/vimgolf_rank
def update_challenge(handle):
    """Fetch Leaderboard and active golfers of the specified challenge, and update datastore."""
    logging.info('update_challenge(%s)' % handle)

    soup = BeautifulSoup(fetch('challenges/' + handle))
    title = soup.findAll('h3')[1].text
    golfers = [row.text.split('@')[-1] for row in soup.findAll('h5')[-1].parent.findAll('h6')]
    record = Challenge(key_name=handle, handle=handle, title=title, active_golfers=golfers)
    record.put()
    logging.info('updated Challenge(%s, %s) with %d golfers' % (handle, title, len(golfers)))

    count = increment('challenge_tasks', -1)
    logging.info('challenge_tasks = %d' % count)
    if count == 0:
        taskqueue.add(url='/top')
Code example #25
File: hitagi.py Project: ydnandy/hitagibot
def main():
    global update_id, misc, plugins, database
    url = "https://api.telegram.org/bot{}/getUpdates?offset={}".format(misc['token'], update_id)
    response = util.fetch(url, misc['session'])
    try:
        response = response.json()
    except AttributeError:
        print("Error parsing Telegram response\nResponse: {}".format(response))
        time.sleep(5)
        return
    if response['ok'] and response['result']:  # Response ok and contains results
        update_id = response['result'][-1]['update_id'] + 1
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=config.workers)  # See docs
        for result in response['result']:  # Loop through result
            if_old = int(time.time()) - int(result['message']['date']) >= 180  # check if message is older than 3 min
            executor.submit(RouteMessage, result['message'], misc, plugins, database, check_db_only=if_old)
        executor.shutdown(wait=False)  # returns immediately, sub processes will close by themselves
    elif not response['ok']:  # Response not ok
        print('Response not OK\nResponse: {}'.format(response))
    time.sleep(config.sleep)  # Sleep for time defined in config
Code example #26
 def get_update(self):  # Gets new messages and sends them to plugin_handler
     url = "{}{}getUpdates?offset={}".format(self.misc['base_url'], self.misc['token'], self.update_id)
     response = util.fetch(url, self.misc['session'])
     try:
         response = response.json()
     except AttributeError:
         time.sleep(5)
         return
     if response['ok']:
         try:
             self.update_id = response['result'][-1]['update_id'] + 1
         except IndexError:
             time.sleep(self.config.sleep)
             return
         with concurrent.futures.ThreadPoolExecutor(max_workers=5) as e:
             for i in response['result']:
                 if self.time - int(i['message']['date']) <= 180000:
                     e.submit(self.route_message, TelegramApi(i['message'], self.misc))
         time.sleep(self.config.sleep)
     else:
         print('Error fetching new messages:\nCode: {}'.format(response['error_code']))
         time.sleep(self.config.sleep)
Code example #27
def dropout_history(enrollment_set, base_date):
    X_pkl_path = util.cache_path('dropout_history_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('dropout_history')

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('Dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        logger.debug('load from cache')

        Dropout_count = util.fetch(pkl_path)
    else:
        logger.debug('preparing datasets')

        Enroll_all = util.load_enrollments()

        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log_enroll_ids = pd.DataFrame(np.unique(Log['enrollment_id']),
                                      columns=['enrollment_id'])

        logger.debug('datasets prepared')

        params = []
        enroll_ids = []
        for i, df in Log.groupby(['enrollment_id']):
            params.append(df)
            enroll_ids.append(i)
        pool = par.Pool(processes=min(n_proc, len(params)))
        enroll_dropout_count = dict(
            zip(enroll_ids, pool.map(__get_dropout_feature__, params)))
        pool.close()
        pool.join()

        enroll_dropout_count = pd.Series(enroll_dropout_count,
                                         name='dropout_count')
        enroll_dropout_count.index.name = 'enrollment_id'
        enroll_dropout_count = enroll_dropout_count.reset_index()

        Enroll_counted = pd.merge(Enroll_all,
                                  enroll_dropout_count,
                                  how='left',
                                  on=['enrollment_id'])
        Dropout_count = pd.merge(Log_enroll_ids,
                                 Enroll_counted,
                                 how='left',
                                 on=['enrollment_id'])

        util.dump(Dropout_count, pkl_path)

    Dgb = Dropout_count.groupby('username')
    total_dropout = Dgb.agg({
        'dropout_count': np.sum
    }).reset_index().rename(columns={'dropout_count': 'total_dropout'})
    avg_dropout = Dgb.agg({
        'dropout_count': np.average
    }).reset_index().rename(columns={'dropout_count': 'avg_dropout'})
    drop_courses = Dgb.agg(
        {'dropout_count': lambda x: len([i for i in x if i > 0])})\
        .reset_index().rename(columns={'dropout_count': 'drop_courses'})
    course_count = Dgb.agg({
        'dropout_count': len
    }).reset_index().rename(columns={'dropout_count': 'course_count'})

    Dropout_count = pd.merge(Dropout_count,
                             total_dropout,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             avg_dropout,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             drop_courses,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             course_count,
                             how='left',
                             on=['username'])

    Dropout_count['drop_ratio'] = (Dropout_count['drop_courses'] /
                                   Dropout_count['course_count'])

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    X = pd.merge(Enroll, Dropout_count, how='left', on=['enrollment_id'])\
        .as_matrix(columns=['dropout_count', 'total_dropout', 'avg_dropout',
                            'drop_courses', 'course_count', 'drop_ratio'])

    logger.debug('dropout history, has nan: %s, shape: %s',
                 np.any(np.isnan(X)), repr(X.shape))

    util.dump(X, X_pkl_path)
    return X
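A short usage sketch for dropout_history(); the enrollment ids come from the project's loader, the base date matches the cached-feature timestamps used throughout these snippets, and the module-level numpy/datetime/util imports are assumed:

# Hypothetical driver: build the dropout-history feature block for the
# training enrollments as of the usual cutoff date.
enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
X_drop = dropout_history(enroll_ids, datetime(2014, 8, 1, 22, 0, 47))
print(X_drop.shape)  # (num_of_enrollments, 6)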
Code example #28
File: bgmf.py Project: IDmy/blockgmf
def factorize(users, movies, ratings, test_users, test_movies, test_ratings, blocks=1, latent=10, steps=10, gpu_steps=2, alpha=0.0002, beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):

    U, V = initUV( np.max(users)-np.min(users)+1, latent, np.max(movies)-np.min(movies)+1)
    U = np.array(U)
    V = np.array(V)

    size = max(np.max(users)+1, np.max(movies)+1)
    split = int(size/blocks)
    us = int(math.ceil( np.float(np.max(users))/split ) )
    vs = int(math.ceil( np.float(np.max(movies))/split ) )
    if debug>1:
        print("Total splits : ",split, us, vs, us*vs)
        print("U, V shapes :", U.shape, V.shape)

    start_time=time.clock()
    y1, y2 = [], []
    count, error = 0, 100
    
    for k in range(steps):

        if debug>1:
            print("Step : ", k)

        u1, v1 = 0, 0
        t4 = time.clock()

        for i in range(us):
            u1 = i*split
            if np.max(users) < u1:
                u1 = int(np.max(users))

            u2 = ((i+1)*split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))

            stemp = 0
            UU, MM, RR = [], [], []
            ulimits = [0]
           
            for j in range(vs):
                xtemp = int((i+stemp)%us)

                print("i, j, ii, jj ", i, j, xtemp, j)

                u1 = xtemp*split
                if np.max(users) < u1:
                    u1 = int(np.max(users))

                u2 = ((xtemp+1)*split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))

                v1 = j*split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))
                    
                v2 = (j+1)*split -1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))

                print("Processing split : " , i , j, u1, u2, v1, v2)

                uu, mm, rr = fetch(u1,u2, v1,v2, users,movies,ratings)

                if(len(uu)!=0 and len(mm)!=0):
                    UU,MM,RR, ulimits = pack(UU,MM,RR, uu,mm,rr, ulimits)

                stemp+=1
            U, V = matrix_factorization(UU,MM,RR, U,V, ulimits,np.min(users), np.min(movies))

        t5 = time.clock()
        if debug>1:
            print(" Step time taken : ", round(t5-t4,2))

        y1.append(round(t5-start_time,3))
        train_rmse = rmse(users, movies, ratings, U, V)
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V)
        print("Train error:", round(train_rmse, 3) , " Test error:", round(test_rmse,3) )
        y2.append(round(test_rmse,3) )

        step_error=round(test_rmse,4)
        
        if step_error < delta:
            break
        elif error<step_error :
            break
        elif rmse_repeat_count<count:
            break
        elif step_error==error:
            count=count+1
        else:
            count = 0
        error=step_error

    np.savetxt('blocks_'+str(gpu_steps)+'iterations_y2.txt', y2, fmt='%.3f')
    np.savetxt('blocks_'+str(gpu_steps)+'iterations_y1.txt', y1, fmt='%.3f')
Code example #29
File: modeling.py Project: Divergent914/kddcup2015
def svc_1():
    """
    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = RFE(estimator=LogisticRegression(class_weight='auto'), step=1,
              n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5),
                            param_distributions={'C': expon()})
    rs.fit(X_new, y)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)

    svc = rs.best_estimator_
    util.dump(svc, util.cache_path('new_data.SVC'))

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('svc', isotonic)]), 'svc_1_0620_01')
Code example #30
def factorize(users, movies, ratings, test_users, test_movies, test_ratings, blocks=1, latent=10, steps=10, gpu_steps=2, alpha=0.00001, beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):

    # The original snippet never builds R, U or V (its seed line was commented
    # out); construct them here, assuming the dense rating matrix is built from
    # the coordinate arrays as in the cpmf.py variant further down.
    R = csr_matrix((ratings, (users, movies))).todense()
    U, V = np.ones((R.shape[0], latent)), np.ones((latent, R.shape[1]))
    size = max(np.max(users)+1, np.max(movies)+1)
    split = int(size/blocks)
    us = int(math.ceil( np.float(np.max(users))/split ) )
    vs = int(math.ceil( np.float(np.max(movies))/split ) )
    if debug>1:
        print("Total splits : ",split, us, vs, us*vs)
        print("U, V shapes :", U.shape, V.shape)

    start_time=time.clock()
    y1, y2 = [], []
    count = 0

    flag1 = error(R, U, V,0, R.shape[1], 0, R.shape[0]) 
    
    for k in range(steps):

        if debug>1:
            print("Step : ", k)

        rmse = 0
        u1, v1 = 0, 0

        t4 = time.clock()
        for i in range(us):
            u1 = i*split
            if np.max(users) < u1:
                u1 = int(np.max(users))

            u2 = ((i+1)*split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))
            
            stemp = 0
            tpool = [None]*vs
            for j in range(vs):
                xtemp = int((i+stemp)%us)
                
                print("i, j, ii, jj ", i, j, xtemp, j)

                u1 = xtemp*split
                if np.max(users) < u1:
                    u1 = int(np.max(users))

                u2 = ((xtemp+1)*split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))
 
                v1 = j*split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))

                v2 = (j+1)*split -1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))

                #print("Processing split : " , i , j, u1, u2, v1, v2)

                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug>1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                t6 = time.clock()
                P, Q = U[u1:u2+1, 0:latent], V[0:latent, v1:v2+1]
                if debug>1:
                    print("P Q shapes : " , P.shape, Q.shape)
                t7 = time.clock()

                if debug>1:
                    print("Length of uu,mm ", len(uu), len(mm), u2-u1+1, v2-v1+1, P.shape, Q.shape)

                if(len(uu)!=0 and len(mm)!=0):
                    t = tpool[j]
                    if t is not None:
                        while t.isAlive():
                            print('waiting for the thread ...')
                            time.sleep(5)
                 	
                    t = threading.Thread(target=block_factorization, args=(P,Q,R, u1, u2, v1, v2, gpu_steps))
                    tpool[j] = t
                    t.start()
                    t8 = time.clock()

                stemp+=1

        t5 = time.clock()
        if debug>1:
            print(" Step time taken : ", round(t5-t4,2))
        y1.append(round(t5-start_time,3))
        
        # the test-set evaluation is temporarily commented out; evaluate on the train data set alone
        test_rmse = error(R, U, V,0, R.shape[1], 0, R.shape[0]) #e(U, V , test_users, test_movies, test_ratings, min(split, max(np.max(test_users), np.max(test_movies))), latent=latent, debug=debug)
        print("Step error :", round(test_rmse,3) )
        y2.append(round(test_rmse,3) )

        flag=round(test_rmse,4)
        gpu_steps = int(gpu_steps*flag/flag1)

        #if flag < delta:
        #    break
        #elif flag1<flag :
        #    break
        #elif rmse_repeat_count<count:
        #    break
        #elif flag==flag1:
        #    count=count+1
        #else:
        #    count = 0
        #flag1=flag

    np.savetxt(str(blocks*blocks)+'blocks_'+str(gpu_steps)+'iterations_y2.txt', y2, fmt='%.3f')
    np.savetxt(str(blocks*blocks)+'blocks_'+str(gpu_steps)+'iterations_y1.txt', y1, fmt='%.3f')
Code example #31
File: modeling.py Project: Sandy4321/kdd2015-2
def svc_1():
    """
    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = RFE(estimator=LogisticRegression(class_weight='auto'),
              step=1,
              n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc,
                            n_iter=50,
                            scoring='roc_auc',
                            n_jobs=-1,
                            cv=StratifiedKFold(y, 5),
                            param_distributions={'C': expon()})
    rs.fit(X_new, y)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)

    svc = rs.best_estimator_
    util.dump(svc, util.cache_path('new_data.SVC'))

    isotonic = CalibratedClassifierCV(svc,
                                      cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))

    to_submission(
        Pipeline([('scale_raw', raw_scaler), ('rfe', rfe),
                  ('scale_new', new_scaler), ('svc', isotonic)]),
        'svc_1_0620_01')
Code example #32
def load_train(earlist_base_date=None, depth=1, cache_only=False):
    """
    Load dataset for training and validating.

    *NOTE*  If you need a validating set, you SHOULD split from training set
    by yourself.

    Parameters
    ----------
    earlist_base_date: datetime, None by default
    Base date won't be smaller than earlist_base_date.

    depth: int, 1 by default
    Maximum moves of time window.

    cache_only: bool, False by default
    Cache data of every period, do not return full spanned data.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features. It is the features of all time if cache_only is True.

    y: numpy ndarray, shape: (num_of_enrollments,)
    Vector of labels. It is the labels of all time if cache_only is True.
    """
    logger = logging.getLogger('load_train')

    enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
    log = util.load_logs()[['enrollment_id', 'time']]
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)

    logger.debug('load features before %s', base_date)

    pkl_X_path = util.cache_path('train_X_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    pkl_y_path = util.cache_path('train_y_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
        logger.debug('fetch cached')
        X = util.fetch(pkl_X_path)
        y = util.fetch(pkl_y_path)
    else:
        X, _ = __load_dataset__(enroll_ids, log, base_date)
        y_with_id = util.load_val_y()
        if not np.all(y_with_id[:, 0] == enroll_ids):
            logger.fatal('something wrong with enroll_ids')
            raise RuntimeError('something wrong with enroll_ids')
        y = y_with_id[:, 1]

        util.dump(X, pkl_X_path)
        util.dump(y, pkl_y_path)

    # base_date = log['time'].max().to_datetime() - timedelta(days=10)
    base_date = datetime(2014, 7, 22, 22, 0, 47)
    Dw = timedelta(days=7)
    enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)
    for _ in range(depth - 1):
        if enroll_ids.size <= 0:
            break
        if earlist_base_date is not None and base_date < earlist_base_date:
            break

        logger.debug('load features before %s', base_date)

        # get instances and labels
        pkl_X_path = util.cache_path('train_X_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        pkl_y_path = util.cache_path('train_y_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
            logger.debug('fetch cached')
            X_temp = util.fetch(pkl_X_path)
            y_temp = util.fetch(pkl_y_path)
        else:
            X_temp, y_temp = __load_dataset__(enroll_ids, log, base_date)

            util.dump(X_temp, pkl_X_path)
            util.dump(y_temp, pkl_y_path)

        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y
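A brief usage sketch for load_train(); the earliest base date and depth below are hypothetical, and datetime is assumed to be imported at module level as in the snippet above:

# Hypothetical call: span up to three weekly time windows, but never use a
# base date earlier than 2014-07-01; X and y stack all generated windows.
X, y = load_train(earlist_base_date=datetime(2014, 7, 1), depth=3)
print(X.shape, y.shape)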
Code example #33
File: dataset.py Project: Divergent914/kddcup2015
def load_train(earlist_base_date=None, depth=1, cache_only=False):
    """
    Load dataset for training and validating.

    *NOTE*  If you need a validating set, you SHOULD split from training set
    by yourself.

    Parameters
    ----------
    earlist_base_date: datetime, None by default
    Base date won't be smaller than earlist_base_date.

    depth: int, 1 by default
    Maximum moves of time window.

    cache_only: bool, False by default
    Cache data of every period, do not return full spanned data.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features. It is the features of all time if cache_only is True.

    y: numpy ndarray, shape: (num_of_enrollments,)
    Vector of labels. It is the labels of all time if cache_only is True.
    """
    logger = logging.getLogger('load_train')

    enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
    log = util.load_logs()[['enrollment_id', 'time']]
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)

    logger.debug('load features before %s', base_date)

    pkl_X_path = util.cache_path('train_X_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    pkl_y_path = util.cache_path('train_y_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
        logger.debug('fetch cached')
        X = util.fetch(pkl_X_path)
        y = util.fetch(pkl_y_path)
    else:
        X, _ = __load_dataset__(enroll_ids, log, base_date)
        y_with_id = util.load_val_y()
        if not np.all(y_with_id[:, 0] == enroll_ids):
            logger.fatal('something wrong with enroll_ids')
            raise RuntimeError('something wrong with enroll_ids')
        y = y_with_id[:, 1]

        util.dump(X, pkl_X_path)
        util.dump(y, pkl_y_path)

    # base_date = log['time'].max().to_datetime() - timedelta(days=10)
    base_date = datetime(2014, 7, 22, 22, 0, 47)
    Dw = timedelta(days=7)
    enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)
    for _ in range(depth - 1):
        if enroll_ids.size <= 0:
            break
        if earlist_base_date is not None and base_date < earlist_base_date:
            break

        logger.debug('load features before %s', base_date)

        # get instances and labels
        pkl_X_path = util.cache_path('train_X_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        pkl_y_path = util.cache_path('train_y_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
            logger.debug('fetch cached')
            X_temp = util.fetch(pkl_X_path)
            y_temp = util.fetch(pkl_y_path)
        else:
            X_temp, y_temp = __load_dataset__(enroll_ids, log, base_date)

            util.dump(X_temp, pkl_X_path)
            util.dump(y_temp, pkl_y_path)

        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y
Code example #34
def mf_rmse(U, V, users, movies, ratings, split, latent=30, debug=1):

    us = int(math.ceil(np.float(np.max(users)) / split))
    vs = int(math.ceil(np.float(np.max(movies)) / split))

    u1, v1 = 0, 0
    error = 0.0
    totnum = 0
    totmse = 0.0
    t4 = time.clock()
    for i in range(us):

        u1 = i * split
        if np.max(users) < u1:
            u1 = int(np.max(users))

        u2 = ((i + 1) * split - 1)
        if np.max(users) < u2:
            u2 = int(np.max(users))

        for j in range(vs):
            v1 = j * split
            if np.max(movies) < v1:
                v1 = int(np.max(movies))

            v2 = (j + 1) * split - 1
            if np.max(movies) < v2:
                v2 = int(np.max(movies))

            if debug > 1:
                print("Processing split : ", i, j, u1, u2, v1, v2)

            uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
            if debug > 1:
                print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)

            t6 = time.clock()
            P, Q = U[u1:u2 + 1, 0:latent], V[0:latent, v1:v2 + 1]
            P = P.reshape(P.shape[0] * P.shape[1], 1).astype(np.float32)
            Q = Q.reshape(Q.shape[0] * Q.shape[1], 1).astype(np.float32)

            tools.clear_context_caches()
            a_gpu = gpuarray.to_gpu(P)
            b_gpu = gpuarray.to_gpu(Q)

            t7 = time.clock()
            u_gpu = gpuarray.to_gpu(uu)
            v_gpu = gpuarray.to_gpu(mm)
            r_gpu = gpuarray.to_gpu(rr)

            ex_gpu = gpuarray.zeros((3072, 1), np.float32)
            ey_gpu = gpuarray.zeros((3072, 1), np.int32)

            if len(uu) > 0:
                rmse(a_gpu,
                     b_gpu,
                     u_gpu,
                     v_gpu,
                     r_gpu,
                     ex_gpu,
                     ey_gpu,
                     np.int32(u2 - u1 + 1),
                     np.int32(latent),
                     np.int32(v2 - v1 + 1),
                     np.int32(u1),
                     np.int32(u2),
                     np.int32(v1),
                     np.int32(v2),
                     np.int32(len(uu)),
                     np.int32(len(mm)),
                     block=(16, 16, 1),
                     grid=(3, 4))
                ex = ex_gpu.get()
                ey = ey_gpu.get()
                num = np.sum(ey)
                mse = np.sum(np.dot(ex.T, ey))
                temp = np.float((totnum + num))

                error = error * (totnum / temp) + (mse / temp)
                totnum += num
                totmse += mse
                if debug > 1:
                    print(" mse , error ", totmse, mse, mse / num, error, num,
                          len(uu))

            t8 = time.clock()

    return np.sqrt(error)
Code example #35
File: blockcpumf.py Project: anonymousicml/blockmf
def factorize(users, movies, ratings, test_users, test_movies, test_ratings, blocks=1, latent=10, steps=10, gpu_steps=1, alpha=0.000001, beta=0.01, delta=0.01, rmse_repeat_count=3, debug=2, dataset=''):

    # R (the dense rating matrix) is assumed to be defined at module level.
    U, V = np.ones((R.shape[0], latent)), np.ones((latent, R.shape[1]))
    size = max(np.max(users)+1, np.max(movies)+1)
    split = int(size/blocks)
    us = int(math.ceil( np.float(np.max(users))/split ) )
    vs = int(math.ceil( np.float(np.max(movies))/split ) )
    if debug>1:
        print("Total splits : ",split, us, vs, us*vs)
        print("U, V shapes :", U.shape, V.shape)

    start_time=time.clock()
    y1, y2 = [], []
    flag1, count = 1000, 0

    for k in range(steps):

        if debug>1:
            print("Step : ", k)

        rmse = 0
        u1, v1 = 0, 0

        t4 = time.clock()
        for i in range(us):
            u1 = i*split
            if np.max(users) < u1:
                u1 = int(np.max(users))

            u2 = ((i+1)*split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))

            for j in range(vs):
                v1 = j*split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))

                v2 = (j+1)*split -1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))

                #print("Processing split : " , i , j, u1, u2, v1, v2)

                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug>1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                t6 = time.clock()
                P, Q = U[u1:u2+1, 0:latent], V[0:latent, v1:v2+1]
                if debug>1:
                    print("P Q shapes : " , P.shape, Q.shape)
                t7 = time.clock()

                if debug>1:
                    print("Length of uu,mm ", len(uu), len(mm), u2-u1+1, v2-v1+1, P.shape, Q.shape)

                if(len(uu)!=0 and len(mm)!=0):
                    P,Q = block_factorization(P,Q,R, u1, u2, v1, v2, steps=gpu_steps)
                    t8 = time.clock()

                    if debug>1:
                        print("Shape of P, Q :", P.shape, Q.shape)

                    U[u1:u2+1, 0:latent] = P.reshape( (u2-u1+1, latent))
                    V[0:latent, v1:v2+1] = Q.reshape( (latent, v2-v1+1))
                    t9 = time.clock()
                    if debug>1:
                        print("Timer :", round(t7-t6,2), round(t8-t7,2), round(t9-t8,2))
                    temp = error(R, P,Q, u1, u2, v1, v2)
                    rmse += temp
                    if debug>1:
                        print("Completed processing : ", i , j, round(rmse,3))

        t5 = time.clock()
        if debug>1:
            print(" Step time taken : ", round(t5-t4,2))
        y1.append(round(t5-start_time,3))
        
        # temporarily hardcoded to work with the train dataset alone
        test_rmse = error(R, U, V,0, R.shape[1], 0, R.shape[0]) #e(U, V , test_users, test_movies, test_ratings, min(split, max(np.max(test_users), np.max(test_movies))), latent=latent, debug=debug)
        print("Step error :", round(test_rmse,3) )
        y2.append(round(test_rmse,3) )

        flag=round(test_rmse,4)

        # un-comment for early convergence stopping 
        # if flag < delta:
        #    break
        #elif flag1<flag :
        #    break
        #elif rmse_repeat_count<count:
        #    break
        #elif flag==flag1:
        #    count=count+1
        #else:
        #    count = 0
        #flag1=flag

    np.savetxt(str(blocks*blocks)+'blocks_'+str(gpu_steps)+'iterations_y2.txt', y2, fmt='%.3f')
    np.savetxt(str(blocks*blocks)+'blocks_'+str(gpu_steps)+'iterations_y1.txt', y1, fmt='%.3f')
Code example #36
                        return servers[i]

            else:
                server = "ss13"
                return servers[server]


    while True:
        try:
            server = get_server()
            print(server)
            if len(server) == 5:
                try:
                    if server[4] == "fetch":
                        status = util.fetch(server[2], server[3], "status")
                    elif server[4] == "http":
                        status = requests.get(server[2]).json()
                    #print(status)
                    if server[0] in ["Baystation 12"]:
                        details = status["map"]+" | "+str(status["players"])+" players"
                    elif server[0] in ["Goonstation #2","Goonstation RP #1", "BeeStation", "FTL13", "Station Bagil", "Station Terry", "Station Sybil", "Citadel Station"]:
                        details = status["map_name"]+" | "+str(status["players"])+" players"

                    if server[0] in ["Goonstation #2","Goonstation RP #1"]:
                        if status["shuttle_time"] != 'welp' and status["shuttle_time"] != '600':
                            rp.set_activity(state=server[0],details=details,large_text=server[0],large_image=server[1], start=int(time.time())-int(status["elapsed"]), end=int(time.time())+int(status["shuttle_time"]))
                        else:
                            rp.set_activity(state=server[0],details=details,large_text=server[0],large_image=server[1], start=int(time.time())-int(status["elapsed"]))

                    elif server[0] in ["BeeStation", "FTL13", "Station Bagil", "Station Terry", "Station Sybil", "Citadel Station"]:
Code example #37
File: cpmf.py Project: IDmy/blockgmf
def factorize(users,
              movies,
              ratings,
              test_users,
              test_movies,
              test_ratings,
              blocks=1,
              latent=30,
              steps=10,
              block_steps=1,
              alpha=0.00001,
              beta=0.01,
              delta=0.01,
              rmse_repeat_count=3,
              debug=2,
              dataset=''):
    global U, V
    U, V = initUV(np.max(users) + 1, latent, np.max(movies) + 1)
    R = csr_matrix((ratings, (users, movies))).todense()

    size = max(np.max(users) + 1, np.max(movies) + 1)
    split = int(size / blocks)
    us = int(math.ceil(np.float(np.max(users)) / split))
    vs = int(math.ceil(np.float(np.max(movies)) / split))
    if debug > 1:
        print("Total splits : ", split, us, vs, us * vs)
        print("U, V shapes :", U.shape, V.shape)

    start_time = time.clock()
    y1, y2 = [], []
    count, error = 0, 100

    for k in range(steps):

        if debug > 1:
            print("Step : ", k)

        u1, v1 = 0, 0

        t4 = time.clock()
        for i in range(us):
            u1 = i * split
            if np.max(users) < u1:
                u1 = int(np.max(users))

            u2 = ((i + 1) * split - 1)
            if np.max(users) < u2:
                u2 = int(np.max(users))

            stemp = 0
            tpool = [None] * vs
            for j in range(vs):
                xtemp = int((i + stemp) % us)

                if debug > 1:
                    print("i, j, ii, jj ", i, j, xtemp, j)

                u1 = xtemp * split
                if np.max(users) < u1:
                    u1 = int(np.max(users))

                u2 = ((xtemp + 1) * split - 1)
                if np.max(users) < u2:
                    u2 = int(np.max(users))

                v1 = j * split
                if np.max(movies) < v1:
                    v1 = int(np.max(movies))

                v2 = (j + 1) * split - 1
                if np.max(movies) < v2:
                    v2 = int(np.max(movies))

                if debug > 1:
                    print("Processing split : ", i, j, u1, u2, v1, v2)

                uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
                if debug > 1:
                    print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)
                t6 = time.perf_counter()
                P, Q = U[u1:u2 + 1, 0:latent], V[v1:v2 + 1, 0:latent]
                if debug > 1:
                    print("P Q shapes : ", P.shape, Q.shape)
                t7 = time.perf_counter()

                if debug > 1:
                    print("Length of uu,mm ", len(uu), len(mm), u2 - u1 + 1,
                          v2 - v1 + 1, P.shape, Q.shape)

                if (len(uu) != 0 and len(mm) != 0):
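                    # If a worker is still recorded for this movie-block in
                    # tpool, wait for it to finish before launching the next
                    # block_factorization thread on the (u1..u2, v1..v2) block.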
                    t = tpool[j]
                    if t is not None:
                        while t.is_alive():
                            print('waiting for the thread ...')
                            time.sleep(5)

                    t = threading.Thread(target=block_factorization,
                                         args=(P, Q, R, u1, u2, v1, v2,
                                               block_steps))
                    tpool[j] = t
                    t.start()
                    t8 = time.perf_counter()

                stemp += 1

        t5 = time.perf_counter()
        if debug > 1:
            print(" Step time taken : ", round(t5 - t4, 2))
        y1.append(round(t5 - start_time, 3))
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V)
        print("Step error :", round(test_rmse, 3))
        y2.append(round(test_rmse, 3))

        step_error = round(test_rmse, 4)

        if step_error < delta:
            break
        elif error < step_error:
            break
        elif rmse_repeat_count < count:
            break
        elif error == step_error:
            count = count + 1
        else:
            count = 0
        error = step_error

    np.savetxt(str(blocks * blocks) + 'blocks_y2.txt', y2, fmt='%.3f')
    np.savetxt(str(blocks * blocks) + 'blocks_y1.txt', y1, fmt='%.3f')
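For orientation, here is a minimal call sketch for factorize. It assumes the rest of cpmf.py is available in the same module (initUV, fetch, block_factorization and rmse are defined there but not shown in this listing), and the tiny arrays below are purely illustrative:

# Toy example (illustrative data only): eight observed ratings from four
# users over four movies, with two held-out pairs for the per-step RMSE.
import numpy as np

users = np.array([0, 0, 1, 1, 2, 2, 3, 3])
movies = np.array([0, 1, 1, 2, 0, 3, 2, 3])
ratings = np.array([4.0, 3.0, 5.0, 1.0, 2.0, 4.0, 3.0, 5.0])

test_users = np.array([0, 3])
test_movies = np.array([2, 0])
test_ratings = np.array([2.0, 4.0])

# 2x2 block grid, 8 latent factors, at most 5 outer sweeps; writes
# '4blocks_y1.txt' / '4blocks_y2.txt' with per-step timings and test RMSE.
factorize(users, movies, ratings,
          test_users, test_movies, test_ratings,
          blocks=2, latent=8, steps=5, block_steps=2, debug=1)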
コード例 #38
0
def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Features
    --------
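    X0: per-week event counts for each (source, event) pair of the enrollment
    X1: number of distinct courses the user visited, per week
    X2: course population (distinct users seen in the course)
    X3: course dropout count (users with no event in the last 10 days)
    X4: X0 as a ratio of the user's event counts over all courses
    X5: X0 as a ratio of all users' event counts on the course
    X6: dropout ratio of the course
    X7: days from the course's first update to base_date
    X8: days from the course's last update to base_date
    X9: days from the user's last op to base_date
    X10: days from the user's first op to base_date
    X11: days from the course's first update to the user's first op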
    """
    X_pkl_path = util.cache_path('source_event_counter_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1

        util.dump(Log, pkl_path)

    Log_counted = Log.groupby(['enrollment_id', 'source_event', 'week_diff'])\
        .agg({'event_count': np.sum}).reset_index()

    logger.debug('datasets prepared')

    Enroll = Enroll_all.set_index('enrollment_id').loc[enrollment_set]\
        .reset_index()

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('event_count_by_eid_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        params = []
        eids = []
        for eid, df in pd.merge(Enroll_all, Log_counted, on=['enrollment_id'])\
                .groupby(['enrollment_id']):
            params.append(df)
            eids.append(eid)
        pool = par.Pool(processes=min(n_proc, len(params)))
        event_count_by_eid = dict(
            zip(eids, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in Enroll['enrollment_id']])

    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])

        util.dump(D_full, pkl_path)

    pkl_path = util.cache_path('user_wn_courses_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby(['username']):
            x = []
            for wn in __week_span__:
                x.append(len(df[df['week_diff'] == wn]['course_id'].unique()))
            user_wn_courses[u] = x

        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])

    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    pkl_path = util.cache_path('course_population_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby(['course_id']):
            course_population[c] = len(df['username'].unique())

        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    pkl_path = util.cache_path('course_dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        course_dropout_count = course_population.copy()
        for c, df in D_full[D_full['day_diff'] < 10].groupby(['course_id']):
            course_dropout_count[c] -= len(df['username'].unique())

        util.dump(course_dropout_count, pkl_path)

    X3 = np.array(
        [course_dropout_count.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    pkl_path = util.cache_path('user_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        users = []
        for u, df in user_ops_on_all_courses.groupby(['username']):
            params.append(df)
            users.append(u)
        pool = par.Pool(processes=min(n_proc, len(params)))
        user_ops_count = dict(
            zip(users, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(user_ops_count, pkl_path)

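    # X4: element-wise ratio of this enrollment's counts (X0) to the same
    # user's counts over all courses; zero-over-zero entries become NaN and
    # are reset to 0 just below.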
    X4 = X0 / [user_ops_count[u] for u in Enroll['username']]
    X4[np.isnan(X4)] = 0

    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    pkl_path = util.cache_path('course_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        courses = []
        for c, df in course_ops_of_all_users.groupby(['course_id']):
            params.append(df)
            courses.append(c)
        pool = par.Pool(processes=min(n_proc, len(params)))
        course_ops_count = dict(
            zip(courses, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(course_ops_count, pkl_path)

    X5 = X0 / [course_ops_count[c] for c in Enroll['course_id']]
    X5[np.isnan(X5)] = 0

    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    X6 = np.array([
        course_dropout_count.get(c, 0) / course_population.get(c, 1)
        for c in Enroll['course_id']
    ])

    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby(['course_id']):
        start_time = np.min(df['start'])
        update_time = np.max(df['start'])
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]

    avg_start_days = np.average([t[0] for _, t in course_time.items()])
    avg_update_days = np.average([t[1] for _, t in course_time.items()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array(
        [course_time.get(c, default_case)[0] for c in Enroll['course_id']])

    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array(
        [course_time.get(c, default_case)[1] for c in Enroll['course_id']])

    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    user_ops_time = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby(['enrollment_id']).agg({'day_diff': [np.min, np.max]})\
        .fillna(0)
    X9 = np.array(user_ops_time['day_diff']['amin'])

    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))

    X10 = np.array(user_ops_time['day_diff']['amax'])

    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10

    logger.debug(
        'days from course first update to user first op, has nan: %s'
        ', shape: %s', np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]
    util.dump(X, X_pkl_path)

    return X
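A minimal sketch of how this feature builder might be driven, assuming the project's util module and its cached enrollment/log data are in place (the actual driver in the original project is not shown in this listing):

# Hypothetical driver for source_event_counter; the cut-off date below is
# illustrative and should match the log snapshot you want to train on.
from datetime import datetime

base_date = datetime(2014, 8, 1, 22, 0, 47)
enrollment_ids = util.load_enrollments()['enrollment_id']

X = source_event_counter(enrollment_ids, base_date)  # one row per enrollment
print(X.shape)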