def main():
    """Print the ``user_id`` of every session read from the first argument.

    Positional arguments are parsed with ``OptionParser``.  Only the first
    one is used: it is passed to ``SessionReader().open`` and the
    ``user_id`` attribute of each yielded session is printed on its own
    line.  A second positional argument (TEST_FILE in the usage text) is
    accepted but not read by this function.

    Exits with a usage error (status 2) when no positional argument is
    given.
    """
    optparser = OptionParser(usage="""
            %prog [OPTIONS] TRAIN_FILE TEST_FILE""")
    _, args = optparser.parse_args()
    if not args:
        # Report a proper usage error instead of an IndexError traceback.
        optparser.error('missing TRAIN_FILE argument')

    for session in SessionReader().open(args[0]):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(session.user_id)
# Example no. 2
# 0
def main():
    """Print URL-based features for every session of a training file.

    Positional arguments:
        TRAIN_FILE      path passed to ``SessionReader().open``.
        STATISTIC_FILE  tab-separated text file; the first three fields of
                        each line are parsed as ``int`` url, ``float`` ctr
                        and ``int`` shows.

    The parsed statistics are handed to ``UrlFeatureCalcer`` as a
    ``{url: (ctr, shows)}`` dict.  For each session the values returned by
    ``calc_features`` are printed as one tab-joined line.  When
    ``-d/--description FILE`` is given, the items returned by
    ``get_description()`` are written to FILE, one per line.

    Exits with a usage error (status 2) when fewer than two positional
    arguments are given.
    """
    optparser = OptionParser(usage="""
            %prog [OPTIONS] TRAIN_FILE STATISTIC_FILE""")
    optparser.add_option('-d',
                         '--description',
                         dest='description_file',
                         metavar='FILE',
                         type='string',
                         default=None,
                         help='filepath for features description')
    opts, args = optparser.parse_args()
    if len(args) < 2:
        # Report a proper usage error instead of an IndexError traceback.
        optparser.error('expected TRAIN_FILE and STATISTIC_FILE arguments')

    urls = {}
    # The context manager closes the statistics file even when a line
    # fails to parse; the original never closed this handle.
    with open(args[1]) as statistic_file:
        for line in statistic_file:
            fields = line.strip().split('\t')
            url = int(fields[0])
            ctr = float(fields[1])
            shows = int(fields[2])
            urls[url] = (ctr, shows)

    url_features_calcer = UrlFeatureCalcer(urls)
    for session in SessionReader().open(args[0]):
        features = url_features_calcer.calc_features(session)
        print("\t".join(map(str, features)))

    if opts.description_file is not None:
        # Closed on every exit path, including a failing get_description().
        with open(opts.description_file, 'w') as output_file:
            description = url_features_calcer.get_description()
            output_file.write("\n".join(map(str, description)) + "\n")
def main():
    """Print per-query occurrence and switch counters for a data file.

    The single positional argument DATA_FILE is passed to
    ``SessionReader().open``.  For every query of every session three
    counters, keyed by ``query.query_id``, are updated:

        1. number of times the query id was seen;
        2. sum of ``int(session.has_switch())`` over those occurrences;
        3. sum of ``len(session.switches)`` over those occurrences.

    One tab-separated line ``query_id, counter1, counter2, counter3`` is
    printed per query id, in dict iteration order.

    Exits with a usage error (status 2) when DATA_FILE is missing.
    """
    optparser = OptionParser(usage="""
            %prog [OPTIONS] DATA_FILE""")
    _, args = optparser.parse_args()
    if not args:
        # Report a proper usage error instead of an IndexError traceback.
        optparser.error('missing DATA_FILE argument')

    # query_id -> [occurrences, has_switch total, switches total].
    # One dict replaces three parallel dicts that had to stay in sync.
    stats = {}
    for session in SessionReader().open(args[0]):
        for query in session.queries:
            counters = stats.setdefault(query.query_id, [0, 0, 0])
            counters[0] += 1
            counters[1] += int(session.has_switch())
            counters[2] += len(session.switches)

    for query_id, counters in stats.items():
        print("\t".join(map(str, [query_id] + counters)))
# Example no. 4
# 0
def main():
    """Group sessions by user id and flush each user's accumulated info.

    ``sys.argv[1]`` is passed to ``SessionReader().open``.  Each yielded
    session is added to a ``UserInfo`` object created on first sight of
    its ``user_id``; once the input is exhausted, ``flush()`` is called on
    every ``UserInfo``.

    Exits with a usage message on stderr (status 1) when no file argument
    is given.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit('Usage: %s SESSIONS_FILE' % sys.argv[0])

    users_info = {}
    for session in SessionReader().open(sys.argv[1]):
        user_id = session.user_id
        if user_id not in users_info:
            users_info[user_id] = UserInfo(user_id)
        users_info[user_id].add(session)

    # Only the values are needed; the key was unused in the original loop.
    for user_info in users_info.values():
        user_info.flush()
# Example no. 5
# 0
def main():
    """Print session-level features for every session of a training file.

    Only the first positional argument is used: it is passed to
    ``SessionReader().open``.  For each session the values returned by
    ``SessionFeatureCalcer.calc_features`` are printed as one tab-joined
    line.  When ``-d/--description FILE`` is given, the items returned by
    ``get_description()`` are written to FILE, one per line.  A second
    positional argument (TEST_FILE in the usage text) is accepted but not
    read by this function.

    Exits with a usage error (status 2) when no positional argument is
    given.
    """
    optparser = OptionParser(usage="""
            %prog [OPTIONS] TRAIN_FILE TEST_FILE""")
    optparser.add_option('-d',
                         '--description',
                         dest='description_file',
                         metavar='FILE',
                         type='string',
                         default=None,
                         help='filepath for features description')
    opts, args = optparser.parse_args()
    if not args:
        # Report a proper usage error instead of an IndexError traceback.
        optparser.error('missing TRAIN_FILE argument')

    session_calcer = SessionFeatureCalcer()
    for session in SessionReader().open(args[0]):
        features = session_calcer.calc_features(session)
        print("\t".join(map(str, features)))

    if opts.description_file is not None:
        # Closed on every exit path, including a failing get_description().
        with open(opts.description_file, 'w') as output_file:
            description = session_calcer.get_description()
            output_file.write("\n".join(map(str, description)) + "\n")
def main():
    """Print the best-ranked URLs among the top results of each query.

    Positional arguments:
        DATA_FILE    path passed to ``SessionReader().open``.
        TOP_SIZE     only the first TOP_SIZE entries of ``query.urls`` are
                     considered for each query.
        RESULT_SIZE  number of URLs to print.
        COEFFICIENT  optional float; read only when exactly four
                     positional arguments are given.

    For each URL two counters are kept: how often it appeared in a query's
    top, and in how many of those appearances the query had a click whose
    ``url_id`` equals the URL.  Each URL becomes a tuple
    ``(url, clicks / appearances, appearances)``.  Tuples are sorted
    ascending by ``(ratio, appearances)``; when COEFFICIENT is given they
    are then stably re-sorted by
    ``COEFFICIENT * ratio + (1 - COEFFICIENT) * appearances``.  The last
    RESULT_SIZE tuples are printed as tab-separated lines, so the
    best-ranked URL is printed last.

    Exits with a usage error (status 2) when fewer than three positional
    arguments are given.
    """
    optparser = OptionParser(usage="""
            %prog [OPTIONS] DATA_FILE TOP_SIZE RESULT_SIZE COEFFICIENT""")
    _, args = optparser.parse_args()
    if len(args) < 3:
        # Report a proper usage error instead of an IndexError traceback.
        optparser.error(
            'expected DATA_FILE, TOP_SIZE and RESULT_SIZE arguments')

    data_file = args[0]
    top_size = int(args[1])
    result_size = int(args[2])
    coefficient = None
    if len(args) == 4:
        coefficient = float(args[3])

    url_in_top = {}
    url_clicks = {}
    for session in SessionReader().open(data_file):
        for query in session.queries:
            for url in query.urls[:top_size]:
                url_in_top[url] = url_in_top.get(url, 0) + 1
                # At most one click is counted per (query, url) appearance.
                clicked = any(click.url_id == url for click in query.clicks)
                url_clicks[url] = url_clicks.get(url, 0) + int(clicked)

    urls = [(url, 1.0 * url_clicks[url] / url_in_top[url], url_in_top[url])
            for url in url_clicks]
    urls.sort(key=lambda x: (x[1], x[2]))
    if coefficient is not None:
        # Stable sort: ties keep the (ratio, appearances) order from above.
        urls.sort(
            key=lambda x: (coefficient * x[1] + (1 - coefficient) * x[2]))

    # urls[-0:] is the whole list, so RESULT_SIZE == 0 used to print every
    # URL.  A non-positive RESULT_SIZE now prints nothing.
    if result_size > 0:
        for url_info in urls[-result_size:]:
            print("\t".join(map(str, url_info)))