Ejemplo n.º 1
0
def get_post_num(post_num, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    i = 0
    while True:
        rec = cur.get(False)
        if not rec:
            break

        if i == post_num:
            item = rec

        cur.step()
        i = i + 1

    cur.disable()
    db.close()

    if item is not None:
        return loads(item[1])
    return dict()
Ejemplo n.º 2
0
def get_items(item_filter, db_file, page=0):
    """Return up to FILTER_MAX decoded items matching ``item_filter``.

    Walks the database newest-first, skips the first FILTER_MAX * page
    matching records, then collects the next FILTER_MAX matches.  Results
    come back sorted by get_key (descending) with blank titles replaced
    by the item's URL.
    """
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break

        # Skip phase: count matching records until FILTER_MAX * page of
        # them have been passed over; nothing is collected yet.
        if item_iter != (FILTER_MAX * page):
            if item_filter(rec):
                item_iter = item_iter + 1
            cur.step_back()
            continue

        # Collect phase: keep every match until the page is full.
        if item_filter(rec):
            items.append(rec)

        cur.step_back()
    cur.disable()
    db.close()

    sorted_items = sorted(items, key=get_key, reverse=True)
    sorted_items_for_viewing = [loads(item[1]) for item in sorted_items]
    # Fall back to the URL when a record carries no usable title.
    for item in sorted_items_for_viewing:
        if item['title'] is None or item['title'] == "":
            item['title'] = item['url']
    return sorted_items_for_viewing
Ejemplo n.º 3
0
def aggregate_by_hour(db_file):
    # Initialize the dict with each hour
    hours = {key: 0 for key in range(0,24)}
    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()

    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)

        hours[time.hour] = hours[time.hour] + 1

        cur.step_back()
    cur.disable()
    db.close()

    hours = [{'name': "{}:00".format(key), 'data': [hours[key]]} for key in hours]
    return hours
Ejemplo n.º 4
0
def get_items(item_filter, db_file, page=0):
    """Collect one page (FILTER_MAX) of decoded items passing ``item_filter``.

    Iterates newest-first; the first FILTER_MAX * page filter matches are
    skipped, the next FILTER_MAX matches are kept.  Empty titles are
    replaced with the item URL before returning.
    """
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break

        # Still skipping earlier pages: only count matches, collect nothing.
        if item_iter != (FILTER_MAX * page):
            if item_filter(rec):
                item_iter = item_iter + 1
            cur.step_back()
            continue

        if item_filter(rec):
            items.append(rec)

        cur.step_back()
    cur.disable()
    db.close()

    sorted_items = sorted(items, key=get_key, reverse=True)
    sorted_items_for_viewing = [loads(item[1]) for item in sorted_items]
    # Use the URL as a stand-in title when none was stored.
    for item in sorted_items_for_viewing:
        if item['title'] is None or item['title'] == "":
            item['title'] = item['url']
    return sorted_items_for_viewing
Ejemplo n.º 5
0
def get_post_num(post_num, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    i = 0
    while True:
        rec = cur.get(False)
        if not rec:
            break

        if i == post_num:
            item = rec

        cur.step()
        i = i + 1

    cur.disable()
    db.close()

    if item is not None:
        return loads(item[1])
    return dict()
Ejemplo n.º 6
0
def __load_blast_data(blast):
    """Map genome ids to the list of BLAST e-values observed for them.

    ``blast`` is an iterable of tab-separated BLAST result lines; column 10
    holds the e-value and column 12 the "master" gi list.  Gi numbers are
    resolved to genome ids through the gene2accession Kyoto Cabinet db.

    Raises Exception when the lookup database cannot be opened.
    """
    # Connect to kyoto db
    db = DB()
    if not db.open("/opt/gene2accession/gene2accession.kch", DB.OREADER):
        raise Exception("Could not load gene2accession.kch: " + str(db.error()))

    hits = {}
    # Raw string so the regex escape is unambiguous.
    gi_num = re.compile(r'gi\|([0-9]+)')
    for line in blast:
        split_line = line.split('\t')

        # Important data
        evalue = float(split_line[10])

        gi_nums = gi_num.findall(split_line[12])
        # Single db.get per gi number (the original queried the db twice:
        # once for the None test and once for the value).
        genome_ids = [g for g in (db.get(x) for x in gi_nums) if g is not None]

        # Thanks to Peter's parser, the gi list and org list are the same
        # length (the first gi column is also the first gi in the "master" gi
        # column)
        for org in genome_ids:
            hits.setdefault(org, []).append(evalue)
    db.close()
    return hits
Ejemplo n.º 7
0
def aggregate_by_hour(db_file):
    # Initialize the dict with each hour
    hours = {key: 0 for key in range(0, 24)}
    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()

    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)

        hours[time.hour] = hours[time.hour] + 1

        cur.step_back()
    cur.disable()
    db.close()

    hours = [{
        'name': "{}:00".format(key),
        'data': [hours[key]]
    } for key in hours]
    return hours
Ejemplo n.º 8
0
def top_things(db_file):
    urls = {}
    people = {}
    graph = {}

    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database. (Top things)"

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)

        if urls.get(split, False) == False:
            urls[split] = 1
        else:
            urls[split] = urls[split] + 1

        person = loaded_rec['person']
        if people.get(person, False) == False:
            people[person] = 1
        else:
            people[person] = people[person] + 1

        if split is not None and split is not "" and \
            person is not None and person is not "":
            # Build a crazy relational graph out of my nosql data
            if graph.get(split, False) == False:
                graph[split] = {
                    "is_person": False,
                    "data": [person],
                    "linked_to_count": 1
                }
            elif person not in graph[split]:
                graph[split]["data"].append(person)
                graph[split][
                    "linked_to_count"] = graph[split]["linked_to_count"] + 1

            if graph.get(person, False) == False:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]:
                graph[person]["data"].append(split)

        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True), graph)
Ejemplo n.º 9
0
def insert_item(url, person, db_file, submitted_title=''):
    """Scrape ``url`` and store a new link record for ``person``.

    Rejects duplicate URLs.  Fetching/parsing the page is best-effort:
    on any failure the title falls back to the URL and the summary to a
    placeholder.  Returns a JSON flask Response describing the outcome.
    """
    mimetype = "application/json"
    db = DB()

    if not db.open("{0}".format(db_file), DB.OWRITER | DB.OCREATE):

        response = {}
        response[
            'What happened?'] = "Couldn't open the damn database. Error: {0}".format(
                db.error())
        return Response(dumps(response), mimetype=mimetype)

    # Refuse to store the same URL twice.
    if is_url_in_db(db, url):
        return Response('{"What happened?": "Someone '\
            'tried to submit a duplicate URL."}',
            mimetype=mimetype)

    title = url
    summary = "~?~"
    try:
        thing = urlopen(url, timeout=10)
        soup = BeautifulSoup(thing)
        title = soup.title.string

        # Do some dumb summarizing if we can
        def concat(a, v):
            return a + " " + v.strip()

        visible_stuff = filter(visible, soup.findAll(text=True))
        summary = reduce(concat, visible_stuff, "")[:900] + "..."
    except:
        # Deliberate best-effort: any scrape failure keeps the fallbacks.
        pass
        #return Response('{"What happened?": '\
        #    'I dunno bs4 messed up somehow."}',
        #    mimetype=mimetype)

    created_at = int(mktime(datetime.now().utctimetuple()))

    is_image = url.lower().endswith(("jpg", "jpeg", "gif", "png"))
    thumbnail = gen_thumbnail_for_url(url, str(created_at))

    # created_at doubles as the record key, so records sort by time.
    record = {
        "created_at": created_at,
        "title": title,
        "url": url,
        "person": person,
        "summary": summary,
        "person_color": PERSON_COLORS[random.randint(0,
                                                     len(PERSON_COLORS) - 1)],
        "is_image": is_image,
        "thumbnail": thumbnail,
        "comment": submitted_title
    }
    db.set(created_at, dumps(record))
    db.close()

    return Response('{"What happened?": "MUDADA"}', mimetype=mimetype)
Ejemplo n.º 10
0
def insert_item(url, person, db_file, submitted_title=''):
    """Store a new link record, preferring a caller-supplied title.

    Duplicate URLs are rejected.  Page fetch/parse is best-effort; on
    failure the URL itself is used as title and the summary stays a
    placeholder.  Returns a JSON flask Response.
    """
    mimetype = "application/json"
    db = DB()

    if not db.open("{0}".format(db_file),
        DB.OWRITER | DB.OCREATE):

        response = {}
        response['What happened?'] = "Couldn't open the damn database. Error: {0}".format(db.error())
        return Response(dumps(response), mimetype=mimetype)

    if is_url_in_db(db, url):
        return Response('{"What happened?": "Someone '\
            'tried to submit a duplicate URL."}',
            mimetype=mimetype)

    title = url
    summary = "~?~"
    try:
        thing = urlopen(url, timeout=10)
        soup = BeautifulSoup(thing)
        title = ''
        # Prefer the submitted title over the scraped <title> tag.
        if len(submitted_title) > 0:
            title = submitted_title
        else:
            title = soup.title.string
        # Do some dumb summarizing if we can
        func = lambda a,v: a + " " + v.strip()
        visible_stuff = filter(visible, soup.findAll(text=True))
        summary = reduce(func, visible_stuff, "")[:900] + "..."
    except:
        # Deliberate best-effort: any scrape failure keeps the fallbacks.
        pass
        #return Response('{"What happened?": '\
        #    'I dunno bs4 messed up somehow."}',
        #    mimetype=mimetype)

    created_at = int(mktime(datetime.now().utctimetuple()))

    is_image = url.lower().endswith(("jpg", "jpeg", "gif", "png"))
    thumbnail = gen_thumbnail_for_url(url, str(created_at))

    # created_at doubles as the record key, so records sort by time.
    record = {
        "created_at": created_at,
        "title": title,
        "url": url,
        "person": person,
        "summary": summary,
        "person_color": PERSON_COLORS[random.randint(0, len(PERSON_COLORS)-1)],
        "is_image": is_image,
        "thumbnail": thumbnail
    }
    db.set(created_at, dumps(record))
    db.close()

    return Response('{"What happened?": "MUDADA"}',
        mimetype=mimetype)
Ejemplo n.º 11
0
def top_things(db_file):
    urls = {}
    people = {}
    graph = {}

    db = DB()

    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database. (Top things)"

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)

        if urls.get(split, False) == False:
            urls[split] = 1
        else:
            urls[split] = urls[split] + 1

        person = loaded_rec['person']
        if people.get(person, False) == False:
            people[person] = 1
        else:
            people[person] = people[person] + 1

        if split is not None and split is not "" and \
            person is not None and person is not "":
            # Build a crazy relational graph out of my nosql data
            if graph.get(split, False) == False:
                graph[split] = {"is_person": False, "data": [person], "linked_to_count": 1}
            elif person not in graph[split]:
                graph[split]["data"].append(person)
                graph[split]["linked_to_count"] = graph[split]["linked_to_count"] + 1

            if graph.get(person, False) == False:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]:
                graph[person]["data"].append(split)

        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True),
            graph)
Ejemplo n.º 12
0
def db_meta_info():
    meta = {}
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file), DB.OREADER):
        print "Could not open database (meta info)."
    meta["size"] = db.size()
    meta["count"] = db.count()
    db.close()

    return meta
Ejemplo n.º 13
0
def get_post_by_date(key, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    item = db.get(key)

    db.close()
    if item is not None:
        return loads(item)
    return dict()
Ejemplo n.º 14
0
def db_meta_info():
    meta = {}
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file), DB.OREADER):
        print "Could not open database (meta info)."
    meta["size"] = db.size()
    meta["count"] = db.count()
    db.close()

    return meta
Ejemplo n.º 15
0
def get_post_by_date(key, db_file):
    item = None
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    item = db.get(key)

    db.close()
    if item is not None:
        return loads(item)
    return dict()
Ejemplo n.º 16
0
def purge(domain, genid):
    """Store ``genid`` under ``domain`` in the genid database.

    Only addresses listed in settings.ALLOW may call this; everyone
    else gets a 403.  Database failures come back as 501 responses.
    """
    if request.remote_addr not in settings.ALLOW:
        return text_response("Not permitted.\n", 403)

    db = DB()

    if not db.open(settings.GENID_DATABASE, DB.OWRITER | DB.OCREATE):
        return text_response("Failed to purge: cannot open database.\n", 501)

    stored = db.set(domain, genid)
    db.close()

    if stored:
        return text_response("Purged <%s>\n" % (domain,))
    return text_response("Failed to purge: cannot set genid.\n", 501)
Ejemplo n.º 17
0
    def decorated_function(*args, **kwargs):
        """Cache wrapper: serve ``f``'s response from /tmp/page_cache.kch."""
        # Debug
        if not current_app.config['CACHE']:
            return f(*args, **kwargs)

        db = DB()
        db.open("/tmp/page_cache.kch")
        res = None
        # Cache key mixes the record count, request URL and view name, so
        # any change to the record count invalidates every cached page.
        fancy = hash("{}{}{}".format(db_meta_info()['count'], request.url, f.func_name))

        res = db.get(fancy)
        if not res:
            # Miss: render the view and remember the result.
            res = f(*args, **kwargs)
            db.set(fancy, res)

        db.close()
        return res
Ejemplo n.º 18
0
def get_items_last_X_days(db_file, X, munge=True):
    """Bucket per-person post counts for the last ``X`` days.

    Returns {day_unix_ms: {person: count}}.  When ``munge`` is true the
    timestamp is truncated to the day; keys are in milliseconds because
    the Javascript consumer feeds them to Date.UTC.
    """
    dates = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    X_days_ago = datetime.now() - timedelta(days=X)

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)

        if time > X_days_ago:
            if munge:
                date_obj = date(year=time.year, month=time.month, day=time.day)
            else:
                date_obj = time
            # Javascript expects Date.UTC to spit out dates of a certain
            # length.
            day_unix = int(mktime(date_obj.timetuple())) * 1000
            if dates.get(day_unix, False) == False:
                dates[day_unix] = {loaded["person"]: 1}
            else:
                relevant_dict = dates[day_unix]

                if relevant_dict.get(loaded["person"], False) == False:
                    relevant_dict[loaded["person"]] = 1
                else:
                    relevant_dict[
                        loaded["person"]] = relevant_dict[loaded["person"]] + 1
        else:
            # Records are walked newest-first, so the first record older
            # than the window ends the scan.
            break

        cur.step_back()
    cur.disable()
    db.close()

    return dates
Ejemplo n.º 19
0
def get_items_last_X_days(db_file, X, munge=True):
    """Per-day, per-person post counts over the trailing ``X`` days.

    Result maps unix-millisecond day keys to {person: count} dicts;
    ``munge`` truncates timestamps to the day boundary.
    """
    dates = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    X_days_ago = datetime.now() - timedelta(days=X)

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        unix = float(loaded['created_at'])
        time = datetime.fromtimestamp(unix)

        if time > X_days_ago:
            if munge:
                date_obj = date(year=time.year, month=time.month, day=time.day)
            else:
                date_obj = time
            # Javascript expects Date.UTC to spit out dates of a certain
            # length.
            day_unix = int(mktime(date_obj.timetuple()))*1000
            if dates.get(day_unix, False) == False:
                dates[day_unix] = {loaded["person"]: 1}
            else:
                relevant_dict = dates[day_unix]

                if relevant_dict.get(loaded["person"], False) == False:
                    relevant_dict[loaded["person"]] = 1
                else:
                    relevant_dict[loaded["person"]] = relevant_dict[loaded["person"]] + 1
        else:
            # Newest-first walk: the first too-old record ends the scan.
            break;

        cur.step_back()
    cur.disable()
    db.close()

    return dates
Ejemplo n.º 20
0
def get_last_items(db_file, pages=1):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < (pages * FILTER_MAX):
        rec = cur.get(False)
        if not rec:
            break

        items.append(rec)
        cur.step_back()
    cur.disable()
    db.close()

    return items
Ejemplo n.º 21
0
def get_last_items(db_file, pages=1):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < (pages * FILTER_MAX):
        rec = cur.get(False)
        if not rec:
            break

        items.append(rec)
        cur.step_back()
    cur.disable()
    db.close()

    return items
Ejemplo n.º 22
0
def get_all_items(db_file):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        items.append(rec)
        cur.step()

    cur.disable()
    db.close()

    sorted_items_for_viewing = [loads(item[1]) for item in items]
    return sorted_items_for_viewing
Ejemplo n.º 23
0
def get_all_items(db_file):
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        items.append(rec)
        cur.step()

    cur.disable()
    db.close()

    sorted_items_for_viewing = [loads(item[1]) for item in items]
    return sorted_items_for_viewing
Ejemplo n.º 24
0
def main():
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s outdir textfile1 textfile2 ...\n'
                         % sys.argv[0])
        sys.exit(1)
    outdir = sys.argv[1]
    tfdb = DB()
    if not tfdb.open(os.path.join(outdir, 'tf.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfdb: %s\n' % str(tfdb.error))
        sys.exit(1)
    dfdb = DB()
    if not dfdb.open(os.path.join(outdir, 'df.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open dfdb: %s\n' % str(dfdb.error))
        sys.exit(1)
    tfidfdb = DB()
    if not tfidfdb.open(os.path.join(outdir, 'tfidf.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfidfdb: %s\n' % str(tfidfdb.error))
        sys.exit(1)

    print 'Count words ...'
    for i in range(len(sys.argv)-2):
        filename = sys.argv[i+2]
        print '(%d/%d) %s' % (i+1, len(sys.argv)-2, filename)
        count_words(tfdb, dfdb, filename)
    print 'Calculate TFIDF ...'
    save_tfidf(tfdb, dfdb, tfidfdb)

    tfdb.close()
    dfdb.close()
    tfidfdb.close()
Ejemplo n.º 25
0
def main():
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: %s outdir textfile1 textfile2 ...\n' %
                         sys.argv[0])
        sys.exit(1)
    outdir = sys.argv[1]
    tfdb = DB()
    if not tfdb.open(os.path.join(outdir, 'tf.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfdb: %s\n' % str(tfdb.error))
        sys.exit(1)
    dfdb = DB()
    if not dfdb.open(os.path.join(outdir, 'df.kch'),
                     DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open dfdb: %s\n' % str(dfdb.error))
        sys.exit(1)
    tfidfdb = DB()
    if not tfidfdb.open(os.path.join(outdir, 'tfidf.kch'),
                        DB.OWRITER | DB.OCREATE | DB.OTRUNCATE):
        sys.stderr.write('cannot open tfidfdb: %s\n' % str(tfidfdb.error))
        sys.exit(1)

    print 'Count words ...'
    for i in range(len(sys.argv) - 2):
        filename = sys.argv[i + 2]
        print '(%d/%d) %s' % (i + 1, len(sys.argv) - 2, filename)
        count_words(tfdb, dfdb, filename)
    print 'Calculate TFIDF ...'
    save_tfidf(tfdb, dfdb, tfidfdb)

    tfdb.close()
    dfdb.close()
    tfidfdb.close()
Ejemplo n.º 26
0
def gen_thumbnails(db_file):
    """Walk every record and generate thumbnails for image URLs in place.

    Opens the database read/write, marks image records with is_image and
    writes the generated thumbnail back onto the record.  Returns True;
    exits the process when the database cannot be opened.
    """
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER):
        sys.exit(1)

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)

        if not rec:
            break

        loaded = loads(rec[1])
        is_image = loaded["url"].lower().endswith(("jpg", "jpeg", "gif", "png"))

        if is_image:
            print "Thumbnailing {}".format(loaded["url"])
            loaded["is_image"] = True
            try:
                thumbnail = gen_thumbnail_for_url(loaded["url"], rec[0])
            except IOError as e:
                # Fetch failed: still persist the is_image flag and move on.
                print "IOError: {}".format(e)
                print "Save result: {}".format(cur.set_value(dumps(loaded)))
                cur.step_back()
                continue


            if thumbnail:
                loaded["thumbnail"] = thumbnail
                print "Thumbnailed {}".format(loaded["url"])
                print "Save result: {}".format(cur.set_value(dumps(loaded)))

        cur.step_back()

    cur.disable()
    db.close()

    return True
Ejemplo n.º 27
0
def get_page_count(item_filter = lambda x: True):
    """Count records matching ``item_filter`` and return the number of
    whole pages (Python 2 integer division by FILTER_MAX)."""
    matched = 0
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER | DB.OCREATE):
        print "Could not open database (get_page_count). Error: {}".format(db.error())

    cur = db.cursor()
    cur.jump_back()
    record = cur.get(False)
    while record:
        if item_filter(record):
            matched += 1
        cur.step_back()
        record = cur.get(False)

    cur.disable()
    db.close()
    return matched / FILTER_MAX
Ejemplo n.º 28
0
def gen_thumbnails(db_file):
    """Regenerate thumbnails for every image record, updating in place.

    Marks image URLs with is_image and stores the thumbnail back on the
    record.  Returns True; exits the process if the database won't open.
    """
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OWRITER):
        sys.exit(1)

    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)

        if not rec:
            break

        loaded = loads(rec[1])
        is_image = loaded["url"].lower().endswith(
            ("jpg", "jpeg", "gif", "png"))

        if is_image:
            print "Thumbnailing {}".format(loaded["url"])
            loaded["is_image"] = True
            try:
                thumbnail = gen_thumbnail_for_url(loaded["url"], rec[0])
            except IOError as e:
                # Fetch failed: persist the is_image flag and keep walking.
                print "IOError: {}".format(e)
                print "Save result: {}".format(cur.set_value(dumps(loaded)))
                cur.step_back()
                continue

            if thumbnail:
                loaded["thumbnail"] = thumbnail
                print "Thumbnailed {}".format(loaded["url"])
                print "Save result: {}".format(cur.set_value(dumps(loaded)))

        cur.step_back()

    cur.disable()
    db.close()

    return True
Ejemplo n.º 29
0
def get_items_on_page(page, db_file):
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break

        if item_iter >= (FILTER_MAX * page):
            items.append(rec)

        item_iter = item_iter + 1
        cur.step_back()
    cur.disable()
    db.close()

    return items
Ejemplo n.º 30
0
def get_items_on_page(page, db_file):
    item_iter = 0
    items = []
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."

    cur = db.cursor()
    cur.jump_back()
    while len(items) < FILTER_MAX:
        rec = cur.get(False)
        if not rec:
            break

        if item_iter >= (FILTER_MAX * page):
            items.append(rec)

        item_iter = item_iter + 1
        cur.step_back()
    cur.disable()
    db.close()

    return items
Ejemplo n.º 31
0
def main():
    """List (and with --delete, remove) every record posted by a user.

    argv: db_file username [--delete].  Without --delete this is a dry
    run that only prints the keys it would remove.
    """
    db_file = argv[1]
    username = argv[2]

    if not db_file and not username:
        print "Need db_file and username."
        return -1

    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER):
        print "Could not open database."
        return -1

    # First pass: collect matching keys so nothing is removed while the
    # cursor is still iterating.
    all_keys = []
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        if loaded["person"] == username:
            all_keys.append(cur.get_key())

        cur.step()
    cur.disable()

    print "Found {} records.".format(len(all_keys))
    for key in all_keys:
        print "Pending {}...".format(key)
        # Only mutate when --delete was explicitly passed.
        if len(argv) > 3 and argv[3] == '--delete':
            print "Removing {}...".format(key)
            if not db.remove(key):
                print "Could not remove key: {}".format(db.error())

    db.close()
Ejemplo n.º 32
0
def main():
    """Find all records for a username; remove them when --delete is given.

    argv: db_file username [--delete].  Defaults to a dry run that only
    prints the keys found.
    """
    db_file = argv[1]
    username = argv[2]

    if not db_file and not username:
        print "Need db_file and username."
        return -1

    db = DB()
    if not db.open("{0}".format(db_file), DB.OWRITER):
        print "Could not open database."
        return -1

    # Gather keys first so removal never races the live cursor.
    all_keys = []
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded = loads(rec[1])
        if loaded["person"] == username:
            all_keys.append(cur.get_key())

        cur.step()
    cur.disable()

    print "Found {} records.".format(len(all_keys))
    for key in all_keys:
        print "Pending {}...".format(key)
        # Destructive path requires the explicit --delete flag.
        if len(argv) > 3 and argv[3] == '--delete':
            print "Removing {}...".format(key)
            if not db.remove(key):
                print "Could not remove key: {}".format(db.error())

    db.close()
Ejemplo n.º 33
0
def get_page_count(item_filter=lambda x: True):
    """Number of whole pages of records matching ``item_filter``."""
    total = 0
    db = DB()
    db_file = current_app.config['DB_FILE']
    if not db.open("{0}".format(db_file),
                   DB.OREADER | DB.OWRITER | DB.OCREATE):
        print "Could not open database (get_page_count). Error: {}".format(
            db.error())

    cur = db.cursor()
    cur.jump_back()
    while True:
        entry = cur.get(False)
        if not entry:
            break

        if item_filter(entry):
            total += 1

        cur.step_back()

    cur.disable()
    db.close()
    # Python 2 integer division: partial pages are dropped.
    return total / FILTER_MAX
Ejemplo n.º 34
0
class KyotoIndex(BinaryIndex):
  ''' Kyoto Cabinet index.
      Notably this uses a B+ tree for the index and thus one can
      traverse from one key forwards and backwards, which supports
      the coming Store synchronisation processes.
  '''

  NAME = 'kyoto'
  SUFFIX = 'kct'

  def __init__(self, nmdbpathbase):
    super().__init__(nmdbpathbase)
    # Underlying kyotocabinet.DB handle; populated by startup().
    self._kyoto = None

  @classmethod
  def is_supported(cls):
    ''' Test whether this index class is supported by the Python environment.
    '''
    # pylint: disable=import-error,unused-import,import-outside-toplevel
    try:
      import kyotocabinet
    except ImportError:
      return False
    return True

  def startup(self):
    ''' Open the index.
    '''
    # pylint: disable=import-error,import-outside-toplevel
    from kyotocabinet import DB
    self._kyoto = DB()
    self._kyoto.open(self.path, DB.OWRITER | DB.OCREATE)

  def shutdown(self):
    ''' Close the index.
    '''
    self._kyoto.close()
    self._kyoto = None

  def flush(self):
    ''' Flush pending updates to the index.
    '''
    # Older kyotocabinet bindings lack the hard= keyword; retry without it.
    try:
      self._kyoto.synchronize(hard=False)
    except TypeError:
      self._kyoto.synchronize()

  def __len__(self):
    return self._kyoto.count()

  def __contains__(self, key):
    # DB.check returns the stored value's size, or a negative value when
    # the key is absent.
    return self._kyoto.check(key) >= 0

  def __getitem__(self, key):
    binary_entry = self._kyoto.get(key)
    if binary_entry is None:
      raise KeyError(key)
    return binary_entry

  def __setitem__(self, key, binary_entry):
    self._kyoto[key] = binary_entry

  def keys(self, *, start_hashcode=None):
    ''' Generator yielding the keys from the index
        in order starting with optional `start_hashcode`.

        Parameters:
        * `start_hashcode`: the starting key; if missing or `None`,
          iteration starts with the first key in the index

        NOTE(review): when `start_hashcode` is None the cursor is never
        jump()ed before the first get_key() call — confirm the binding
        positions a fresh cursor at the first record.
    '''
    cursor = self._kyoto.cursor()
    if start_hashcode is not None:
      cursor.jump(start_hashcode)
    yield cursor.get_key()
    while cursor.step():
      yield cursor.get_key()
    cursor.disable()

  sorted_keys = keys

  __iter__ = keys
class BKNNModel( Model ):
  ''' Binary classifier backed by two Kyoto Cabinet stores:

      * a "continuous" store (`*.kch`) mapping discretised feature
        buckets to packed (label, continuous-features) records, and
      * a "discrete" store (`*-discrete.kch`) counting occurrences of
        (label, categorical+binary combination).

      Scoring combines a Mahalanobis-distance k-nearest-neighbour vote
      over the continuous features with smoothed naive-Bayes style
      log-odds over the discrete features.

      NOTE(review): the contracts of catfe/binfe/contfe/fdisc/fsel and
      of the `Model` base class are defined elsewhere in the project.
  '''


  def __init__( self, fn, mode, catfe, binfe, contfe, fdisc, fsel, kval ):
    ''' `fn` is the base `.kch` filename; sibling filenames for the
        discrete store, pickled metadata and pickled inverse covariance
        are derived from it.  `kval` is k for the kNN vote.
    '''

    Model.__init__( self, fn, mode, catfe, binfe, contfe, fdisc, fsel );

    self._kval = kval;

    # filenames derived from the base .kch path
    self._fn_cdata = self._fn;
    self._fn_ddata = self._fn.replace( '.kch', '-discrete.kch' );
    self._fn_meta = self._fn.replace( '.kch', '-meta.pickle' );
    self._fn_icov = self._fn.replace( '.kch', '-icov.pickle' );

    # open DB handles (set in __enter__)
    self._cdata = None;
    self._ddata = None;

    # per-section feature-vector lengths, fixed by the first row seen
    self._len_c = None;
    self._len_b = None;
    self._len_x = None;

    self._rowcount = None;
    self._total_pos = None;
    self._total_neg = None;

    # inverse covariance for Mahalanobis distance; decision cutoff
    self._icov = None;
    self._co = None;

    # bounded sample of training rows, consumed by _finalize()
    self._sample_y = [];
    self._sample_c = [];
    self._sample_b = [];
    self._sample_x = [];
    self._sample_x_ = [];

    self._needs_finalization = False;
    self._needs_initialization = True;

    # per-dimension (label, value) counts and derived log-odds scores
    self._dmarginals = {};
    self._dscores = {};

    # number of scored points with fewer than kval positive neighbours
    self._sparse_points = 0;

    self._bias = None;


  def __enter__( self ):
    ''' Open both kyotocabinet stores (read-only in "r" mode; recreated
        from scratch in "w" mode).  In read mode, also load the pickled
        metadata (section lengths and cutoff) and inverse covariance.
    '''

    self._cdata = DB();
    self._ddata = DB();

    try:
      if self._mode == "r":
        assert self._cdata.open( self._fn_cdata, DB.OREADER );
      elif self._mode == "w":
        if isfile( self._fn_cdata ):
          remove( self._fn_cdata );
        assert self._cdata.open( self._fn_cdata, DB.OWRITER | DB.OCREATE );
      else:
        assert False;
    except:
      if self._cdata is not None:
        print( str( self._cdata.error() ) );
      raise;

    try:
      if self._mode == "r":
        assert self._ddata.open( self._fn_ddata, DB.OREADER );
      elif self._mode == "w":
        if isfile( self._fn_ddata ):
          remove( self._fn_ddata );
        assert self._ddata.open( self._fn_ddata, DB.OWRITER | DB.OCREATE );
      else:
        assert False;
    except:
      if self._ddata is not None:
        print( str( self._ddata.error() ) );
      raise;

    if self._mode == "r":

      with open( self._fn_meta, 'rb' ) as f:
        r = pickle_load( f );
        self._len_c = r[ "c" ];
        self._len_b = r[ "b" ];
        self._len_x = r[ "x" ];
        self._co = r[ "co" ];

      with open( self._fn_icov, 'rb' ) as f:
        self._icov = pickle_load( f );

    return self;


  def __exit__( self, exc_type, exc_value, traceback ):
    ''' Close both stores.  On a clean exit in write mode, run any
        pending finalisation and persist metadata + inverse covariance;
        on an exceptional exit in write mode, delete all partially
        written files.  Returns False so exceptions propagate.
    '''

    ex_w_exc = False;
    ex_w_exc = ex_w_exc or ( exc_type is not None );
    ex_w_exc = ex_w_exc or ( exc_value is not None );
    ex_w_exc = ex_w_exc or ( traceback is not None );

    if ( not ex_w_exc ) and ( self._mode == "w" ):

      if self._needs_finalization:
        self._finalize();

      with open( self._fn_meta, 'wb' ) as f:

        r = { "c": self._len_c,
              "b": self._len_b,
              "x": self._len_x,
              "co": self._co };

        pickle_dump( r, f );

      with open( self._fn_icov, 'wb' ) as f:

        pickle_dump( self._icov, f );

    if self._cdata is not None:
      try:
        assert self._cdata.close();
      except:
        print( str( self._cdata.error() ) );
        raise;
      self._cdata = None;

    if self._ddata is not None:
      try:
        assert self._ddata.close();
      except:
        print( str( self._ddata.error() ) );
        raise;
      self._ddata = None;

    if ex_w_exc and ( self._mode == "w" ):

      # a failed write leaves inconsistent state: remove all artefacts
      if isfile( self._fn_cdata ):
        remove( self._fn_cdata );

      if isfile( self._fn_ddata ):
        remove( self._fn_ddata );

      if isfile( self._fn_meta ):
        remove( self._fn_meta );

      if isfile( self._fn_icov ):
        remove( self._fn_icov );

    return False;


  def train( self, row ):
    ''' Ingest one training row ( y, c, b, x ):

        * bump the count for the (y, categorical+binary) combination in
          the discrete store;
        * append the packed (y, continuous point) to its discretised
          bucket in the continuous store;
        * retain up to 50000 rows in memory for _finalize().

        Always returns False.
    '''

    self._needs_finalization = True;

    ( y, c, b, x ) = row;

    # featurise and feature-select each section
    c = self._fsel.apply_c( self._catfe( c ) );
    b = self._fsel.apply_b( self._binfe( b ) );

    x = self._contfe( x );
    x_ = self._fdisc( x );

    x = self._fsel.apply_x( x );
    x_ = self._fsel.apply_x( x_ );

    # debug trace, disabled
    if False:
      print( y, c, b, x, x_ );

    # section lengths are fixed by the first row and enforced thereafter
    if self._len_c is None:
      self._len_c = len(c);
    assert self._len_c == len(c);

    if self._len_b is None:
      self._len_b = len(b);
    assert self._len_b == len(b);

    if self._len_x is None:
      self._len_x = len(x);
    assert self._len_x == len(x);

    if self._rowcount is None:
      self._rowcount = 0;

    self._rowcount += 1;

    # discrete store: key = (y, c..., b...), value = running count
    dkeyfmt = '>' + ( 'I' * ( 1 + self._len_c + self._len_b ) );
    self._ddata.increment( pack( dkeyfmt, y, *(c+b) ), 1, 0 );

    # continuous store: key = discretised bucket, value = packed
    # (y, x...) records appended back to back
    ckeyfmt = '>' + ( 'I' * len(x) );
    cvalfmt = '>I' + ( 'f' * len(x) );
    self._cdata.append( pack( ckeyfmt, *x_ ), pack( cvalfmt, y, *x ) );

    if len( self._sample_x ) < 50000:

      assert len( self._sample_x ) == len( self._sample_y );
      assert len( self._sample_x ) == len( self._sample_c );
      assert len( self._sample_x ) == len( self._sample_b );
      assert len( self._sample_x ) == len( self._sample_x_ );

      self._sample_y.append( y );
      self._sample_c.append( c );
      self._sample_b.append( b );
      self._sample_x.append( x );
      self._sample_x_.append( x_ );

    return False;


  def _init( self ):
    ''' Build per-dimension (y, value) marginal counts from the discrete
        store, then derive smoothed log-odds scores per value and the
        global class-prior bias.
    '''

    self._needs_initialization = False;

    c = self._ddata.cursor();
    c.jump();

    keyfmt = '>' + ( 'I' * ( 1 + self._len_c + self._len_b ) );
    valfmt = '>Q';


    while True:

      # get(True) also steps the cursor forward
      r = c.get( True );
      if not r:
        break;

      dbkey = unpack( keyfmt, r[0] );
      dbval = unpack( valfmt, r[1] )[ 0 ];

      additional_count = dbval;

      y = dbkey[ 0 ];

      for ( i, value_of_variable_i ) in enumerate( dbkey[ 1: ] ):

        if not i in self._dmarginals:
          self._dmarginals[ i ] = {};

        self._dmarginals[ i ][ (y,value_of_variable_i) ] \
          = self._dmarginals[ i ].get( (y,value_of_variable_i), 0 ) \
              + additional_count;


    # sanity-check: every dimension must account for the same totals
    for ( i, count_by_val ) in self._dmarginals.items():

      total = 0;
      total_neg = 0;
      total_pos = 0;

      for ( ( y, val ), cnt ) in count_by_val.items():
        total += cnt;
        if y == 0:
          total_neg += cnt;
        elif y == 1:
          total_pos += cnt;

      if self._rowcount is None:
        self._rowcount = total;
      assert self._rowcount == total;

      if self._total_neg is None:
        self._total_neg = total_neg;
      try:
        assert self._total_neg == total_neg;
      except:
        print( self._total_neg, total_neg );
        raise;

      if self._total_pos is None:
        self._total_pos = total_pos;
      try:
        assert self._total_pos == total_pos;
      except:
        print( self._total_pos, total_pos );
        raise;

    assert ( self._total_pos + self._total_neg ) == self._rowcount;


    # smoothed per-value log-odds score for each discrete dimension
    for i in self._dmarginals:

      values = set([ val for (y,val) in self._dmarginals[ i ].keys() ]);

      if i not in self._dscores:
        self._dscores[ i ] = {};

      for val in values:

        pos_cnt = self._dmarginals[ i ].get( (1,val), 0 );
        neg_cnt = self._dmarginals[ i ].get( (0,val), 0 );

        p_pos \
          =   log( float(pos_cnt) + SMOOTHING, 2.0 ) \
            - log( float(self._total_pos) + float( len(values) ) * SMOOTHING, 2.0 );

        p_neg \
          =   log( float(neg_cnt) + SMOOTHING, 2.0 ) \
            - log( float(self._total_neg) + float( len(values) ) * SMOOTHING, 2.0 );

        self._dscores[ i ][ val ] = p_pos - p_neg;


    # class-prior log-odds bias
    p_pos \
      =   log( float(self._total_pos), 2.0 ) \
        - log( float(self._rowcount), 2.0 );

    p_neg \
      =   log( float(self._total_neg), 2.0 ) \
        - log( float(self._rowcount), 2.0 );

    self._bias = p_pos - p_neg;


    # debug dump of the per-value scores, disabled
    if False:
      for i in sorted( self._dscores.keys() ):
        score_by_val = self._dscores[ i ];
        for ( val, score ) in score_by_val.items():
          print( "{:d} {:10d} {:+2.4f}".format( i, val, score ) );


  def _apply( self, row ):
    ''' Score one pre-processed row ( c, b, x, x_ ).

        Searches the continuous store over neighbouring discretised
        buckets, keeps the kval nearest positive points by Mahalanobis
        distance, counts negatives within that radius, and combines the
        resulting smoothed log-odds with the discrete per-dimension
        scores and the bias.  Returns the raw score when no cutoff is
        set, else the 0/1 decision.
    '''

    if self._needs_initialization:
      self._init();

    ( c, b, x, x_ ) = row;

    ckeyfmt = '>' + ( 'I' * len(x_) );
    cvalfmt = '>I' + ( 'f' * len(x) );
    cvalsz = calcsize( cvalfmt );

    # candidate buckets: each discretised coordinate +/- 2, clipped to
    # the 0..31 bucket range
    rng = [];
    for xval in x_:
      rng.append(
          [ xv \
              for xv \
               in [ xval-2, xval-1, xval, xval+1, xval+2 ] \
               if 0 <= xv <= 31 ]
        );

    x_vec = np.array( x ).reshape( 1, self._len_x ).T;

    nearest_positive = [];
    all_negative = [];
    found_ident = 0;

    for xvals in product( *rng ):

      try:
        ckey = pack( ckeyfmt, *xvals );
      except:
        print( ckeyfmt, xvals );
        raise;
      val = self._cdata.get( ckey );

      # the bucket value is a concatenation of fixed-size records;
      # consume it record by record
      while val:

        if len(val) <= cvalsz:
          assert len(val) == cvalsz;

        val_ = val[:cvalsz];
        val = val[cvalsz:];

        pt = unpack( cvalfmt, val_ );
        pt_y = pt[0];
        pt_x = pt[1:];

        # Mahalanobis distance via the precomputed inverse covariance
        pt_x_vec = np.array( pt_x ).reshape( 1, self._len_x ).T;
        diff = pt_x_vec - x_vec;
        dist = np.sqrt( np.dot( np.dot( diff.T, self._icov ), diff ) );

        # skip the (presumed) identical point, i.e. the query itself
        if dist <= 0.0001:
          found_ident += 1;
          continue;

        if pt_y == 0:
          all_negative.append( dist );
          continue;

        assert pt_y == 1;

        nearest_positive.append( dist );
        nearest_positive.sort();
        nearest_positive = nearest_positive[:self._kval];

    # assert found_ident == 1;
    # assert len( nearest_positive ) == self._kval;
    if len( nearest_positive ) < self._kval:
      self._sparse_points += 1;

    score = self._bias;

    # if len( nearest_positive ) > 0:
    if True:

      # radius of the kNN ball; None when no positive neighbour exists
      if len( nearest_positive ) == 0:
        threshold = None;
      else:
        threshold = nearest_positive[-1];

      neg_cnt = 0;
      for dist in all_negative:
        if ( threshold is None ) or ( dist <= threshold ):
          neg_cnt += 1;

      p_pos \
        =   log( float( len(nearest_positive) ) + SMOOTHING, 2.0 ) \
          - log( float(self._total_pos) + 2.0 * SMOOTHING, 2.0 );

      p_neg \
        =   log( float(neg_cnt) + SMOOTHING, 2.0 ) \
          - log( float(self._total_neg) + 2.0 * SMOOTHING, 2.0 );

      score += p_pos - p_neg;

    # add the discrete per-dimension log-odds
    for ( i, dval ) in enumerate( c+b ):
      score += self._dscores[ i ].get( dval, 0.0 );

    if self._co is None:
      return score;
    else:
      if score >= self._co:
        return 1;
      else:
        return 0;


  def _finalize( self ):
    ''' Compute the inverse covariance from the retained sample, score
        the sample, and choose the decision cutoff that maximises the
        F-score over a grid of 1000 quantile candidates.
    '''

    self._needs_finalization = False;

    covsample = np.array( self._sample_x );
    cov = np.cov( covsample.T );
    self._icov = LA.inv( cov );

    sample \
      = zip(
            self._sample_c,
            self._sample_b,
            self._sample_x,
            self._sample_x_
          );

    scores = [];
    for ( c, b, x, x_ ) in sample:
      scores.append( self._apply( [ c, b, x, x_ ] ) );

    sorted_scores = list( sorted( scores ) );

    # candidate cutoffs: the 0.0%..99.9% quantiles of the sample scores
    cutoffs = [];
    for idx in range(0,1000):
      ratio = float(idx) / 1000.0;
      cutoffs.append(
          sorted_scores[ int( float( len(sorted_scores) ) * ratio ) ]
        );

    if False:
      pprint( cutoffs );

    # confusion-matrix counts per candidate cutoff
    stats_by_co = [];
    for coidx in range( 0, len(cutoffs) ):
      stats_by_co.append( { "tp": 0, "fp": 0, "tn": 0, "fn": 0 } );

    for ( y, score ) in zip( self._sample_y, scores ):
      for ( coidx, co ) in enumerate( cutoffs ):
        if score >= co:
          if y == 1:
            stats_by_co[ coidx ][ "tp" ] += 1;
          else:
            assert y == 0;
            stats_by_co[ coidx ][ "fp" ] += 1;
        else:
          if y == 0:
            stats_by_co[ coidx ][ "tn" ] += 1;
          else:
            assert y == 1;
            stats_by_co[ coidx ][ "fn" ] += 1;

    max_fscore = None;
    max_fscore_coidx = None;

    for ( coidx, co ) in enumerate( cutoffs ):

      tp = stats_by_co[ coidx ][ "tp" ];
      fp = stats_by_co[ coidx ][ "fp" ];
      tn = stats_by_co[ coidx ][ "tn" ];
      fn = stats_by_co[ coidx ][ "fn" ];

      # skip cutoffs where precision or recall is undefined
      if (tp+fp) <= 0:
        continue;

      if (tp+fn) <= 0:
        continue;

      precision = float(tp) / float(tp+fp);
      recall = float(tp) / float(tp+fn);

      if (precision+recall) <= 0.0:
        continue;

      fscore = 2.0 * ( ( precision * recall ) / ( precision + recall ) );

      if ( max_fscore is None ) or ( fscore > max_fscore ):

        max_fscore = fscore;
        max_fscore_coidx = coidx;

    assert max_fscore_coidx is not None;
    self._co = cutoffs[ max_fscore_coidx ];

    # assert self._sparse_points == 0;

    if True:
      print( self._sparse_points );
      print( self._co );
      print( max_fscore );


  def __call__( self, row ):
    ''' Classify one raw row ( c, b, x ): featurise exactly as in
        train(), then delegate to _apply().
    '''

    ( c, b, x ) = row;

    c = self._fsel.apply_c( self._catfe( c ) );
    b = self._fsel.apply_b( self._binfe( b ) );

    x = self._contfe( x );
    x_ = self._fdisc( x );

    x = self._fsel.apply_x( x );
    x_ = self._fsel.apply_x( x_ );

    try:
      assert self._len_c == len(c);
      assert self._len_b == len(b);
      assert self._len_x == len(x);
      assert self._len_x == len(x_);
    except:
      print( self._len_c, self._len_b, self._len_x );
      raise;

    return self._apply( ( c, b, x, x_ ) );
Ejemplo n.º 36
0
class DataStorage(object):
    """
    Parent class for RowData and KeyValueData.

    Dispatches on the file extension (.csv, .json, .kch, .sqlite3):
    an existing file is opened for reading, a missing one for writing.
    """

    def __init__(self, filename, headers=None):
        self.filename = filename
        self.ext = os.path.splitext(filename)[1]
        self.headers = headers

        if os.path.exists(self.filename):
            self.init_read()
        else:
            self.init_write()

    def init_write(self):
        """Prepare the backing store for writing."""
        self.mode = "write"
        ext = self.ext

        if ext == ".csv":
            self._data_file = open(self.filename, "wb")
            self._writer = csv.writer(self._data_file)
            if self.headers:
                self._writer.writerow(self.headers)
        elif ext == ".json":
            # JSON content is accumulated in memory and dumped on save()
            self._storage = {}
        elif ext == ".kch":
            from kyotocabinet import DB

            db = DB()
            self._storage = db
            if not db.open(self.filename, DB.OWRITER | DB.OCREATE):
                msg = "Error opening kyotocabinet db: %s" % (db.error())
                raise dexy.commands.UserFeedback(msg)
        elif ext == ".sqlite3":
            self.init_write_sqlite3()
        else:
            raise dexy.commands.UserFeedback("unsupported extension %s" % self.ext)

    def init_read(self):
        """Open the backing store for reading."""
        self.mode = "read"
        ext = self.ext

        if ext == ".csv":
            self._file = open(self.filename, "rb")
        elif ext == ".json":
            with open(self.filename, "rb") as handle:
                self._storage = json.load(handle)
        elif ext == ".kch":
            from kyotocabinet import DB

            self._storage = DB()
            # NOTE(review): unlike init_write, the open() result is not
            # checked here — presumably deliberate; confirm.
            self._storage.open(self.filename, DB.OREADER)
        elif ext == ".sqlite3":
            import sqlite3

            self._storage = sqlite3.connect(self.filename)
            self._cursor = self._storage.cursor()
        else:
            raise dexy.commands.UserFeedback("unsupported extension %s" % self.ext)

    def save(self):
        """Flush and close the backing store (write mode)."""
        if self.ext == ".csv":
            self._data_file.close()
        elif self.ext == ".json":
            with open(self.filename, "wb") as handle:
                import json

                json.dump(self._storage, handle)
        elif self.ext == ".kch":
            if not self._storage.close():
                raise dexy.commands.UserFeedback(self._storage.error())
        elif self.ext == ".sqlite3":
            self._storage.commit()
            self._cursor.close()
        else:
            raise dexy.commands.UserFeedback("unsupported extension %s" % self.ext)
Ejemplo n.º 37
0
class DataStorage(object):
    """
    Parent class for RowData and KeyValueData.

    Chooses a storage backend from the file extension; existing files
    are opened read-only, new files are opened for writing.
    """
    def __init__(self, filename, headers=None):
        self.filename = filename
        self.ext = os.path.splitext(filename)[1]
        self.headers = headers

        exists = os.path.exists(self.filename)
        self.init_read() if exists else self.init_write()

    def init_write(self):
        """Set up the backend for writing."""
        self.mode = "write"

        if self.ext == ".csv":
            self._data_file = open(self.filename, "wb")
            self._writer = csv.writer(self._data_file)
            if self.headers:
                self._writer.writerow(self.headers)
        elif self.ext == ".json":
            # accumulated in memory; written out by save()
            self._storage = {}
        elif self.ext == ".kch":
            from kyotocabinet import DB
            self._storage = DB()
            opened = self._storage.open(self.filename,
                                        DB.OWRITER | DB.OCREATE)
            if not opened:
                msg = "Error opening kyotocabinet db: %s" % (
                    self._storage.error())
                raise dexy.exceptions.UserFeedback(msg)
        elif self.ext == ".sqlite3":
            self.init_write_sqlite3()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" %
                                               self.ext)

    def init_read(self):
        """Set up the backend for reading."""
        self.mode = "read"

        if self.ext == ".csv":
            self._file = open(self.filename, "rb")
        elif self.ext == ".json":
            with open(self.filename, "rb") as handle:
                self._storage = json.load(handle)
        elif self.ext == ".kch":
            from kyotocabinet import DB
            self._storage = DB()
            # NOTE(review): open() result unchecked here, unlike the
            # write path — presumably deliberate; confirm.
            self._storage.open(self.filename, DB.OREADER)
        elif self.ext == ".sqlite3":
            import sqlite3
            self._storage = sqlite3.connect(self.filename)
            self._cursor = self._storage.cursor()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" %
                                               self.ext)

    def save(self):
        """Persist and close the backend."""
        if self.ext == ".csv":
            self._data_file.close()
        elif self.ext == ".json":
            with open(self.filename, "wb") as handle:
                import json
                json.dump(self._storage, handle)
        elif self.ext == ".kch":
            if not self._storage.close():
                raise dexy.exceptions.UserFeedback(self._storage.error())
        elif self.ext == ".sqlite3":
            self._storage.commit()
            self._cursor.close()
        else:
            raise dexy.exceptions.UserFeedback("unsupported extension %s" %
                                               self.ext)
Ejemplo n.º 38
0
def get_user_stats(username, db_file):
    """Collect posting statistics for `username` from the kyotocabinet db.

    Returns a dict with total post count, per-domain counts, first and
    most recent post timestamps (unix + ISO), and average posting rates.
    `aliases` is reserved and currently always left empty.
    """
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }

    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print("Could not open database.")

    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        if loaded_rec['person'] != username:
            cur.step()
            continue

        # A post by the requested user: bump the per-domain counter.
        split = get_domain(loaded_rec)
        item['domains'][split] = item['domains'].get(split, 0) + 1

        # Records are visited in key order, so the first match supplies
        # the first-post timestamp.
        if item['first_post_date_unix'] is None:
            item['first_post_date_unix'] = loaded_rec['created_at']

        if item['most_recent_post_unix'] < loaded_rec['created_at']:
            item['most_recent_post_unix'] = loaded_rec['created_at']

        item['total_posts'] = item['total_posts'] + 1

        cur.step()

    cur.disable()
    db.close()

    # Convert the unix timestamps into ISO date strings.

    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()

    recent_time = None
    if item['most_recent_post_unix'] is not None:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()

    if first_time and recent_time:
        delta = recent_time - first_time
        item['user_age_days'] = delta.days
        item['user_age_seconds'] = delta.total_seconds()
        # BUG FIX: the denominators previously converted seconds to
        # minutes (/60.0) for the "per hour" rate and to minute-derived
        # pseudo-units for day/week.  Use real seconds per hour, day
        # and week instead.
        item['average_posts_per_hour'] = item['total_posts'] / (
            delta.total_seconds() / 3600.0)
        item['average_posts_per_day'] = item['total_posts'] / (
            delta.total_seconds() / 86400.0)
        item['average_posts_per_week'] = item['total_posts'] / (
            delta.total_seconds() / 604800.0)

    return item
class FeatureSelector( Frontend ):
  ''' Mutual-information based feature selector.

      During training it counts co-occurrences of (dimension i value,
      dimension j value, label) pairs in a temporary Kyoto Cabinet
      store, copies them to a LevelDB for ordered scans, and derives:

      * per-dimension statistics against the label (`self._ic`), and
      * pairwise statistics between dimensions (`self._icbp`).

      From those it picks a small set of "core" dimensions (highest
      joint entropy / output correlation), adds strongly correlated
      "satellite" dimensions, and removes the rest.  The apply_*
      methods then filter feature vectors accordingly.

      NOTE(review): `self._rowcount`, `self._state` and the train()/
      _finalize() protocol are provided by the `Frontend` base class,
      which is defined elsewhere — contracts assumed, confirm there.
  '''


  def __init__( self, fn, mode ):
    ''' `fn` is the pickle file holding the selector state; `mode` is
        "r" (load state) or "w" (train and write state).
    '''

    Frontend.__init__( self, fn, mode );

    # temporary kyotocabinet count store (write mode only)
    self._kdbfn = None;
    self._kdb = None;

    # temporary LevelDB used for ordered scans of the counts
    self._ldbdn = None;
    self._ldb = None;

    self._len_c = None;
    self._len_b = None;
    self._len_x = None;

    # per-dimension and per-pair information statistics
    self._ic = None;
    self._icbp = None;

    self._needs_initialization = True;

    self._core_dims = set();
    self._satellite_dims = set();
    self._removed_dims = set();

    # indices to drop, per section
    self._remove_c = set();
    self._remove_b = set();
    self._remove_x = set();

    # when set, the corresponding apply_* becomes a no-op
    self.bypass_c = False;
    self.bypass_b = False;
    self.bypass_x = False;


  def __enter__( self ):
    ''' In read mode, load the pickled statistics.  In write mode,
        create the temporary kyotocabinet and LevelDB stores.
    '''

    if self._mode == "r":
      with open( self._fn, "rb" ) as f:
        state = pickle_load( f );
        self._len_c = state[ "c" ];
        self._len_b = state[ "b" ];
        self._len_x = state[ "x" ];
        self._lenrow = self._len_c + self._len_b + self._len_x;
        self._ic = state[ "ic" ];
        self._icbp = state[ "icbp" ];

    if self._mode == "w":

      # NOTE(review): the NamedTemporaryFile is closed immediately and
      # only its name (plus '.kch') is reused — a deliberate but racy
      # way to obtain a fresh temp filename.
      with NamedTemporaryFile() as tmpfn:
        self._kdbfn = tmpfn.name + '.kch';
      self._kdb = KDB();
      try:
        assert self._kdb.open( self._kdbfn, KDB.OWRITER | KDB.OCREATE );
      except:
        print( str( self._kdb.error() ) );
        raise;

      # same trick for the LevelDB directory name
      with TemporaryDirectory() as tmpdirname:
        self._ldbdn = tmpdirname;
      self._ldb = LDB( self._ldbdn, create_if_missing=True );

    return self;

  def __exit__( self, exc_type, exc_value, traceback ):
    ''' Delegate to Frontend.__exit__, then tear down and delete the
        temporary stores.  Exceptions are propagated (implicit None
        return).
    '''

    assert Frontend.__exit__( self, exc_type, exc_value, traceback ) == False;

    if self._ldb is not None:
      # presumably gives LevelDB background work time to settle before
      # the directory is removed — TODO confirm necessity
      sleep( 3.0 );
      self._ldb.close()

    if self._ldbdn is not None:
      rmtree( self._ldbdn );

    if self._kdb is not None:
      try:
        assert self._kdb.close();
      except:
        print( str( self._kdb.error() ) );
        raise;

    if self._kdbfn is not None:
      remove( self._kdbfn );


  def train( self, row ):
    ''' Count one training row ( y, c, b, x ) into the temporary
        kyotocabinet store, keyed by (i, j, y, row[i], row[j]) for the
        upper-triangular dimension pairs plus each dimension against
        the last one.

        NOTE(review): assumes self._lenrow has been set (presumably by
        Frontend.train or __enter__) — confirm in the base class.
    '''

    ( y, c, b, x ) = row;

    if self._len_c is None:
      self._len_c = len(c);
    assert self._len_c == len(c);

    if self._len_b is None:
      self._len_b = len(b);
    assert self._len_b == len(b);

    if self._len_x is None:
      self._len_x = len(x);
    assert self._len_x == len(x);

    row = c + b + x;

    if Frontend.train( self, row ):
      return True;

    keyfmt = '>IIIII';

    for i in range( 0, self._lenrow ):
      for j in range( 0, self._lenrow ):

        # only the upper triangle, except every i is also paired with
        # the last dimension
        if ( i >= j ) and ( not ( i == self._lenrow-1 ) ):
          continue;

        key = pack( keyfmt, i, j, y, row[i], row[j] );

        try:
          assert self._kdb.increment( key, 1, 0 );
        except:
          print( str(self._kdb.error()) );
          raise;


  def _stats( self, cnt_by_a, cnt_by_b, cnt_by_ab ):
    ''' Derive entropy statistics from count dictionaries over the
        variables A and B.

        Returns the 5-tuple
        ( I(A;B) / min(H(A),H(B)), H(A), H(B), H(A,B), I(A;B) ).
    '''

    h_a = 0.0;
    h_b = 0.0;
    h_ab = 0.0;

    for cnt in cnt_by_a.values():
      p = float(cnt) / float(self._rowcount);
      if p > 0.0:
        h_a -= p * log( p, 2.0 );

    for cnt in cnt_by_b.values():
      p = float(cnt) / float(self._rowcount);
      if p > 0.0:
        h_b -= p * log( p, 2.0 );

    for cnt in cnt_by_ab.values():
      p = float(cnt) / float(self._rowcount);
      if p > 0.0:
        h_ab -= p * log( p, 2.0 );

    # BUG FIX: these degenerate branches used to `return 1.0` — a bare
    # float — while every caller unpacks a 5-tuple
    # (corr, h_a, h_b, h_ab, mi), which would raise TypeError.  A zero
    # marginal entropy means the variable is constant, so I(A;B) = 0;
    # the historical convention corr = 1.0 is kept.
    if h_a == 0.0:
      return ( 1.0, h_a, h_b, h_ab, 0.0 );

    if h_b == 0.0:
      return ( 1.0, h_a, h_b, h_ab, 0.0 );

    mi = h_a + h_b - h_ab;
    return ( mi / min( h_a, h_b ), h_a, h_b, h_ab, mi );


  def _get_info_content_by_dimension( self, i ):
    ''' Scan the LevelDB for dimension `i` (against the first partner
        dimension found) and return _stats between the label (key[2])
        and the dimension's value (key[3]).
    '''

    keyfmt = '>IIIII';
    valfmt = '>Q';

    j = None;

    cnt_by_a = {};
    cnt_by_b = {};
    cnt_by_ab = {};
    total = 0;

    with self._ldb.iterator() as it:

      it.seek( pack( keyfmt, i,0,0,0,0 ) );

      for ( key, val ) in it:

        key = unpack( keyfmt, key );
        val = unpack( valfmt, val )[ 0 ];

        if not ( key[0] == i ):
          break;

        # lock onto the first partner dimension encountered
        if j is None:
          j = key[1];

        if not ( key[1] == j ):
          break;

        # key[2] is the y-value
        a = key[2];

        # key[3] is the value for the i-th dimension
        b = key[3];

        cnt_by_ab[ (a,b) ] = cnt_by_ab.get( (a,b), 0 ) + val;
        cnt_by_a[ a ] = cnt_by_a.get( a, 0 ) + val;
        cnt_by_b[ b ] = cnt_by_b.get( b, 0 ) + val;

        total += val;

    try:
      assert total == self._rowcount;
    except:
      print( i, j, total, self._rowcount );
      raise;

    return self._stats( cnt_by_a, cnt_by_b, cnt_by_ab );


  def _get_info_content_by_pair( self, i, j ):
    ''' Scan the LevelDB for the dimension pair (i, j) and return
        _stats between (y, value_i) and (y, value_j).
    '''

    keyfmt = '>IIIII';
    valfmt = '>Q';

    cnt_by_a = {};
    cnt_by_b = {};
    cnt_by_ab = {};
    total = 0;

    with self._ldb.iterator() as it:

      it.seek( pack( keyfmt, i,j,0,0,0 ) );

      for ( key, val ) in it:

        key = unpack( keyfmt, key );
        val = unpack( valfmt, val )[ 0 ];

        if not ( ( key[0] == i ) and ( key[1] == j ) ):
          break;

        # key[2] is the y-value, key[3] the i-th value for the i-th dim
        a = ( key[2], key[3] );

        # key[2] is the y-value, key[4] the i-th value for the j-th dim
        b = ( key[2], key[4] );

        assert (a,b) not in cnt_by_ab;
        cnt_by_ab[ (a,b) ] = cnt_by_ab.get( (a,b), 0 ) + val;

        cnt_by_a[ a ] = cnt_by_a.get( a, 0 ) + val;
        cnt_by_b[ b ] = cnt_by_b.get( b, 0 ) + val;

        total += val;

    assert total == self._rowcount;

    return self._stats( cnt_by_a, cnt_by_b, cnt_by_ab );


  def _finalize( self ):
    ''' Copy the kyotocabinet counts into the LevelDB (for ordered
        iteration), compute the per-dimension and per-pair statistics,
        and assemble the pickleable state dict.
    '''

    assert Frontend._finalize( self ) is None;

    if False:
      print( "unique combinations = ", self._kdb.count() );

    valfmt = '>Q';

    c = self._kdb.cursor();
    c.jump();

    # histogram of count magnitudes, for optional debugging output
    gt2 = 0;
    gt4 = 0;
    gt8 = 0;
    gt16 = 0;
    gt32 = 0;

    while True:

      r = c.get( True );
      if not r:
        break;

      self._ldb.put( r[0], r[1] );

      val = unpack( valfmt, r[1] )[ 0 ];

      if val > 2:
        gt2 += 1;
      if val > 4:
        gt4 += 1;
      if val > 8:
        gt8 += 1;
      if val > 16:
        gt16 += 1;
      if val > 32:
        gt32 += 1;

    if False:
      print( gt2, gt4, gt8, gt16, gt32 );

    self._ic = {};
    for i in range( 0, self._lenrow ):
      self._ic[ i ] = self._get_info_content_by_dimension( i );

    self._icbp = {};

    for i in range( 0, self._lenrow ):
      for j in range( 0, self._lenrow ):

        if i >= j:
          continue;

        self._icbp[ (i,j) ] = self._get_info_content_by_pair( i, j );


    self._state \
      = { "ic": self._ic,
          "icbp": self._icbp,
          "c": self._len_c,
          "b": self._len_b,
          "x": self._len_x };


  def _fmt_dim( self, d_ ):
    ''' Format a flat dimension index as e.g. "7(b2)" showing which
        section (c/b/x) it falls into.
    '''

    d = None;
    if d_ < self._len_c:
      d = "c" + str( d_ );
    elif d_ < self._len_c + self._len_b:
      d = "b" + str( d_ - self._len_c );
    elif d_ < self._len_c + self._len_b + self._len_x:
      d = "x" + str( d_ - self._len_c - self._len_b );
    else:
      assert False;
    return "{:d}({:s})".format( d_, d );


  def _init( self ):
    ''' Select the core, satellite and removed dimension sets from the
        computed statistics, and populate the per-section removal sets
        used by apply_c/apply_b/apply_x.
    '''

    self._needs_initialization = False;

    # debug dump of the raw statistics, disabled
    if False:

      for i in sorted( self._ic ):

        (corr,h_a,h_b,h_ab,mi) = self._ic[ i ];

        print(
            "{:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
             .format(
                  self._fmt_dim( i ),
                  corr,
                  h_a,
                  h_b,
                  h_ab,
                  mi
                )
          );

      for (i,j) in sorted( self._icbp ):

        (corr,h_a,h_b,h_ab,mi) = self._icbp[ (i,j) ];

        print(
            "{:s} {:s} {:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
             .format(
                  self._fmt_dim( i ),
                  self._fmt_dim( j ),
                  corr,
                  h_a,
                  h_b,
                  h_ab,
                  mi
                )
          );

    entropy \
      = [ ( h_ab, i ) \
          for ( i, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ];

    output_correlation \
      = [ ( corr, i ) \
          for ( i, (corr,h_a,h_b,h_ab,mi) ) in self._ic.items() ];

    # core = top 5 by joint entropy plus top 3 by output correlation
    self._core_dims = set();

    self._core_dims \
      |= { i \
           for ( h_ab, i ) \
           in sorted( entropy, reverse=True )[ :5 ] };

    self._core_dims \
      |= { i \
           for ( h_ab, i ) \
           in sorted( output_correlation, reverse=True )[ :3 ] };

    if True:
      print(
          "core = ",
          " ".join([ self._fmt_dim(d) for d in self._core_dims ])
        );

    # satellites = for each core dimension, the most strongly
    # correlated other dimension (if its correlation exceeds 0.5)
    self._satellite_dims = set();

    for core_dim in self._core_dims:

      satellite_dim = None;
      satellite_dim_c = None;
      satellite_dim_stats = None;

      for ( (i,j), (corr,h_a,h_b,h_ab,mi) ) in self._icbp.items():

        if corr <= 0.5:
          continue;

        other_dim = None;
        if i == core_dim:
          other_dim = j;
        elif j == core_dim:
          other_dim = i;
        else:
          continue;

        if ( satellite_dim_c is None ) or ( corr > satellite_dim_c ):

          satellite_dim = other_dim;
          satellite_dim_c = corr;
          satellite_dim_stats = (corr,h_a,h_b,h_ab,mi);

      if satellite_dim is not None:

        self._satellite_dims.add( satellite_dim );

        if False:

          print(
              '->',
              self._fmt_dim(core_dim),
              self._fmt_dim(satellite_dim)
            );

          print(
              "{:1.4f} {:1.4f} {:1.4f} {:1.4f} {:1.4f}"\
               .format( *(corr,h_a,h_b,h_ab,mi) )
            );

    if True:

      print(
          "satellite = ",
          " ".join([ self._fmt_dim(d) for d in self._satellite_dims ])
        );

    # everything that is neither core nor satellite gets removed
    self._removed_dims = set();
    for i in self._ic:
      if i not in self._core_dims and i not in self._satellite_dims:
        self._removed_dims.add( i );

    if True:

      print(
          "removed = ",
          " ".join([ self._fmt_dim(d) for d in self._removed_dims ])
        );

    # translate flat indices into per-section removal sets
    for d_ in self._removed_dims:
      if d_ < self._len_c:
        self._remove_c.add( d_ );
      elif d_ < self._len_c + self._len_b:
        self._remove_b.add( d_ - self._len_c );
      elif d_ < self._len_c + self._len_b + self._len_x:
        self._remove_x.add( d_ - self._len_c - self._len_b );
      else:
        assert False;


  def apply_c( self, c ):
    ''' Filter a categorical feature vector, dropping removed indices
        (no-op when bypass_c is set).
    '''

    if self.bypass_c:
      return c;

    if self._needs_initialization:
      self._init();

    return [ cval for ( i, cval ) in enumerate( c )
             if i not in self._remove_c ];


  def apply_b( self, b ):
    ''' Filter a binary feature vector, dropping removed indices
        (no-op when bypass_b is set).
    '''

    if self.bypass_b:
      return b;

    if self._needs_initialization:
      self._init();

    return [ bval for ( i, bval ) in enumerate( b )
             if i not in self._remove_b ];


  def apply_x( self, x ):
    ''' Filter a continuous feature vector, dropping removed indices
        (no-op when bypass_x is set).
    '''

    if self.bypass_x:
      return x;

    if self._needs_initialization:
      self._init();

    return [ xval for ( i, xval ) in enumerate( x )
             if i not in self._remove_x ];


  def __call__( self, row ):
    ''' Apply the selector to a full ( y, c, b, x ) row; the label is
        passed through unchanged.
    '''

    if self._needs_initialization:
      self._init();

    ( y, c, b, x ) = row;

    y_ = y;

    return \
      ( y_,
        self.apply_c( c ),
        self.apply_b( b ),
        self.apply_x( x ) );
Ejemplo n.º 40
0
   not db.set("user:"******":sex", "male") or\
   not db.set("user:"******":id", "10") or\
   not db.set("user:"******":ups", "100") or\
   not db.set("user:"******":downs", "10") or\
   not db.set("user:"******":hits", "110") or\
   not db.set("user:count", "100") or\
   not db.set("user:id:1", "*****@*****.**") or\
   not db.set("user:id:2", "*****@*****.**") or\
   not db.set("user:id:3", "*****@*****.**"):
         print >>sys.stderr, "set error: " + str(db.error())

# retrieve records
o= format_datetime(float(db.get("file:"+key+":createdata")))
v=db.get("file:"+key+":key")
va=db.get("mark:"+cid+":description")
val = db.get("category:"+cid+":name")
valu = db.get("category:1:id")
value = db.get("user:id:1")
if value:
         print o
         print v
         print va
         print val
         print valu
         print value
else:
         print >>sys.stderr, "get error: " + str(db.error())

# close the database
if not db.close():
         print >>sys.stderr, "close error: " + str(db.error())
Ejemplo n.º 41
0
def get_user_stats(username, db_file):
    """Scan the post database and aggregate statistics for *username*.

    Walks every record in *db_file* (a kyotocabinet database of msgpack/
    pickle-encoded posts), keeping those whose 'person' field matches
    *username*, and returns a dict with post counts, per-domain tallies,
    first/most-recent post timestamps and average posting rates.
    """
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }

    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print("Could not open database.")

    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break

        loaded_rec = loads(rec[1])
        if loaded_rec['person'] != username:
            cur.step()
            continue

        # A post by the user we're looking for: tally its domain.
        domain = get_domain(loaded_rec)
        item['domains'][domain] = item['domains'].get(domain, 0) + 1

        # Records are visited in key order from the front, so the first
        # match is treated as the earliest post.
        if item['first_post_date_unix'] is None:
            item['first_post_date_unix'] = loaded_rec['created_at']

        if item['most_recent_post_unix'] < loaded_rec['created_at']:
            item['most_recent_post_unix'] = loaded_rec['created_at']

        item['total_posts'] = item['total_posts'] + 1

        cur.step()

    cur.disable()
    db.close()

    # Convert unix timestamps to ISO strings and derive posting rates.

    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()

    recent_time = None
    # BUG FIX: this was checked with `is not None`, but the field starts at
    # 0, so a user with zero posts used to get an epoch timestamp here.
    if item['most_recent_post_unix']:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()

    if first_time and recent_time:
        delta = recent_time - first_time
        item['user_age_days'] = delta.days
        seconds = delta.total_seconds()
        item['user_age_seconds'] = seconds
        # Guard against a zero-length posting window (all posts share one
        # timestamp), which previously raised ZeroDivisionError.
        if seconds > 0:
            # BUG FIX: the old divisors (60, 60*24, 60*24*7) converted
            # seconds to MINUTES, not hours/days/weeks; use the real
            # seconds-per-unit constants.
            item['average_posts_per_hour'] = item['total_posts'] / (seconds / 3600.0)
            item['average_posts_per_day'] = item['total_posts'] / (seconds / 86400.0)
            item['average_posts_per_week'] = item['total_posts'] / (seconds / 604800.0)

    return item
Ejemplo n.º 42
0
class KyotoCabinetGraph(BaseGraph):
    """Graph storage backend backed by a KyotoCabinet key/value database."""

    def __init__(self, path):
        """Open (or create) the database file at *path* and prepare the graph.

        Raises GrapheekDataKyotoCabinetInitFailureException when the
        database cannot be opened.
        """
        self._path = path
        self._db = DB()
        opened = self._db.open(path, DB.OREADER | DB.OWRITER | DB.OCREATE)
        if not opened:
            raise GrapheekDataKyotoCabinetInitFailureException(
                str(self._db.error()))
        super(KyotoCabinetGraph, self).__init__()
        self._ensure_prepared()
        self._closed = False

    # Start method overriding :

    def _db_close(self):
        """Close the underlying database unless it is already closed."""
        if self._closed:
            return
        self._db.close()

    def _transaction_begin(self):
        """Start a KyotoCabinet transaction; True serves as the txn token."""
        self._db.begin_transaction()
        return True

    def _transaction_commit(self, txn):
        """Commit the active transaction (the *txn* token is unused)."""
        self._db.end_transaction(True)

    def _transaction_rollback(self, txn):
        """Abort the active transaction (the *txn* token is unused)."""
        self._db.end_transaction(False)

    def _has_key(self, key):
        """Return True if *key* exists; check() yields the value size or -1."""
        return self._db.check(key) >= 0

    def _get(self, txn, key):
        """Fetch and msgpack-deserialize the value stored at *key*.

        Returns UNDEFINED (not None) when the key is absent, because None
        is itself a legitimate stored value.
        """
        raw = self._db.get(key)
        if raw is None:
            return UNDEFINED
        return msgpack.loads(raw, encoding='utf8')

    def _bulk_get(self, txn, keys):
        """Fetch several keys in one call.

        Returns a dict mapping each found key (as str) to its deserialized
        value; keys reach us as bytes on Python 3 and must be decoded.
        """
        result = {}
        raw_items = self._db.get_bulk(keys)
        for raw_key, raw_value in list(raw_items.items()):
            if PYTHON2:  # pragma : no cover
                decoded_key = raw_key
            else:  # pragma : no cover
                decoded_key = str(raw_key, encoding='utf8')
            result[decoded_key] = msgpack.loads(raw_value, encoding='utf8')
        return result

    def _set(self, txn, key, value):
        """Serialize *value* with msgpack and store it at *key*.

        Raises GrapheekDataKyotoCabinetException when the write fails.
        """
        packed = msgpack.dumps(value, encoding='utf8')
        res = self._db.set(key, packed)
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_set(self, txn, updates):
        """Serialize and store every (key, value) pair from *updates*.

        Raises GrapheekDataKyotoCabinetException when the bulk write fails
        (set_bulk signals failure with -1).
        """
        packed = {k: msgpack.dumps(v, encoding='utf8')
                  for k, v in list(updates.items())}
        res = self._db.set_bulk(packed)
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _remove(self, txn, key):
        """Remove *key* if present.

        Contrary to the LocalMemoryGraph implementation, no try/except
        wrapper is needed: KyotoCabinet merely returns False when the key
        does not exist, so this _remove method is idempotent
        (cf LocalMemoryGraph._remove method comment).
        """
        self._db.remove(key)

    def _bulk_remove(self, txn, keys):
        """Remove every key in *keys*; raise if the bulk removal fails (-1)."""
        res = self._db.remove_bulk(list(keys))
        if res == -1:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _remove_prefix(self, txn, prefix):
        """Remove every key that starts with *prefix*."""
        keys = self._db.match_prefix(prefix)
        self._db.remove_bulk(keys)

    # overriding list methods
    # looks like a bucket of hacks, and yes indeed it is :)
    # btw, it REALLY improves performance if we compare to default implementation which,
    # in the case of KyotoCabinet would involve msgpack deserialization followed by a serialization

    def _init_lst(self, txn, key):
        """Initialize an empty list at *key* (stored as an empty string)."""
        res = self._db.set(key, '')
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _get_lst(self, txn, key):
        """Load the list of ints stored at *key*; UNDEFINED when absent.

        Lists are stored as '|n1|n2|...' (see _append_to_lst), so splitting
        on '|' yields a leading empty element that is sliced off.
        """
        raw = self._db.get(key)
        if raw is None:
            return UNDEFINED
        if not PYTHON2:  # pragma : no cover
            raw = str(raw, encoding='utf8')
        return [int(part) for part in raw.split('|')[1:]]

    def _set_lst(self, txn, key, values):
        """Store *values* as a '|'-joined string with a leading '|'.

        The leading separator lets _append_to_lst simply concatenate
        '|<value>'; an empty list is stored as the single character '|'.
        """
        serialized = '|' + '|'.join(str(value) for value in values)
        res = self._db.set(key, serialized)
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_get_lst(self, txn, keys):
        """Load several stored lists, preserving the order of *keys*.

        Absent keys yield an empty list (not UNDEFINED).  On Python 3 the
        bulk-get result is keyed by bytes, so lookups must be encoded.
        """
        raw_map = self._db.get_bulk(keys)
        results = []
        for key in keys:
            lookup = key if PYTHON2 else bytes(key, encoding='utf8')
            raw = raw_map.get(lookup, UNDEFINED)
            if raw == UNDEFINED:
                results.append([])
                continue
            if not PYTHON2:  # pragma : no cover
                raw = str(raw, encoding='utf8')
            # Stored as '|n1|n2|...'; drop the leading empty split element.
            results.append([int(part) for part in raw.split('|')[1:]])
        return results

    def _append_to_lst(self, txn, key, value):
        """Append one value by concatenating '|<value>' to the stored string."""
        self._db.append(key, '|' + str(value))

    def _bulk_append_to_lst(self, txn, key, values):
        """Append several values in a single '|'-joined concatenation."""
        newval = '|'.join([str(value) for value in values])
        self._db.append(key, '|' + newval)

    def _remove_from_lst(self, txn, key, value):
        """Remove exactly ONE occurrence of *value* from the list at *key*.

        Removing a single occurrence is deliberate: e.g. when the list holds
        neighbour nodes, the current entity and a neighbour can be linked
        several times, and each unlink must drop only one link.

        Raises ValueError when *value* is not present.
        """
        old = self._db.get(key)
        if not PYTHON2:  # pragma : no cover
            old = str(old, encoding='utf8')
        new = old.replace('|%s' % value, '', 1)
        if new == old:
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, new)
        if not res:  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res

    def _bulk_remove_from_lst(self, txn, key, values):
        """Remove one occurrence of each of *values* from the list at *key*.

        Mirrors _remove_from_lst: exactly one occurrence per value is
        removed on purpose (entities can be linked multiple times).

        Raises ValueError when none of the values were present.
        """
        assert (len(values))
        old = self._db.get(key)
        if PYTHON2:  # pragma : no cover
            text = old
        else:  # pragma : no cover
            text = str(old, encoding='utf8')
        new = text
        for value in values:
            new = new.replace('|%s' % value, '', 1)
        # BUG FIX: previously this compared `new` (str) against `old`, which
        # on Python 3 is bytes — str == bytes is always False, so the
        # "not in list" error could never fire.  Compare against the decoded
        # original instead.
        if new == text:  # pragma : no cover
            raise ValueError("list.remove(x): x not in list")
        res = self._db.set(key, new)
        if not (res):  # pragma : no cover
            raise GrapheekDataKyotoCabinetException(
                'KyotoCabinet : error while saving')
        return res