Ejemplo n.º 1
0
def im_webpage(srcs):
    """Import web pages from file to database.

    Every input line is parsed as a JSON object; its ``web`` text is cleaned
    with ``html_filter`` and stored, together with ``place_id``, in table
    `web`. Text rejected by ``isreadable`` is echoed (first 80 characters)
    and skipped. A line that raises is reported with its line number and
    traceback, and the import moves on to the next line.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    # Make MySQLdb warnings raise instead of merely being displayed
    warnings.simplefilter("error", MySQLdb.Warning)
    cur = CONN_POOL.get_cur(GEOTWEET)
    imported, lineno = 0, 0
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        lineno += 1
        try:
            page = json.loads(line)
            text = page['web']
            if not isreadable(text):
                print(text[:80].encode('cp1252', errors='ignore'))
                continue
            item = (page['place_id'],
                    html_filter(text).encode('utf-8', errors='ignore'))
            cur.execute(
                'INSERT INTO web ( \
                    place_id, \
                    web) \
                    VALUES(%s,%s)', item)
            imported += 1
        except StandardError:
            print('Fail at line {0}'.format(lineno))
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed as well.
            traceback.print_exc(file=sys.stdout)
    logging.info(
        'Import web pages::{0} out of {1} imported.'.format(imported, lineno))
    logging.info('------------------------------------------')
Ejemplo n.º 2
0
def filter_tweet():
    """Copy the tweets that still yield tokens from `tweet` into `sample`.

    The filtering itself (getting rid of square game text) is delegated to
    ``get_tokens``: a tweet is copied only when ``get_tokens`` returns a
    non-empty result for its text. The number of copied tweets is logged.
    """
    reader = CONN_POOL.get_cur(GEOTWEET)
    writer = CONN_POOL.get_cur(GEOTWEET)

    reader.execute('select id, text from tweet')
    seen, copied = 0, 0
    for tweet in reader:
        seen += 1
        if len(get_tokens(tweet['text'])) > 0:
            # Parameters are wrapped in a tuple: the DB-API defines the
            # second argument of execute() as a sequence or mapping.
            writer.execute(
                'insert into `sample` \
                    select * from `tweet`\
                    where `tweet`.`id` = %s', (tweet['id'],))
            copied += 1
    logging.info(
        '{0} out of {1} tweets are transferred'.format(copied, seen))
Ejemplo n.º 3
0
def place_name(pid, dbconf=GEOTWEET):
    """Return the name stored in table `place` for the place id `pid`.

    `pid` is returned unchanged when it does not match ``IDPTN`` or when
    no row with that id exists.
    """
    if IDPTN.match(pid) is None:
        return pid
    cur = CONN_POOL.get_cur(dbconf)
    cur.execute("select name from place where id=%s", (pid,))
    row = cur.fetchone()
    # fetchone() gives None when nothing matched; fall back to the raw id
    # instead of failing on None['name'].
    if row is None:
        return pid
    return row['name']
Ejemplo n.º 4
0
def im_webpage(srcs):
    """Import web pages from file to database.

    Every input line is parsed as a JSON object; its ``web`` text is cleaned
    with ``html_filter`` and stored, together with ``place_id``, in table
    `web`. Text rejected by ``isreadable`` is echoed (first 80 characters)
    and skipped. A line that raises is reported with its line number and
    traceback, and the import moves on to the next line.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    # Make MySQLdb warnings raise instead of merely being displayed
    warnings.simplefilter("error", MySQLdb.Warning)
    cur = CONN_POOL.get_cur(GEOTWEET)
    imported, lineno = 0, 0
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        lineno += 1
        try:
            page = json.loads(line)
            text = page["web"]
            if not isreadable(text):
                print(text[:80].encode("cp1252", errors="ignore"))
                continue
            cleaned = html_filter(text).encode("utf-8", errors="ignore")
            item = (page["place_id"], cleaned)
            cur.execute(
                "INSERT INTO web ( \
                    place_id, \
                    web) \
                    VALUES(%s,%s)",
                item,
            )
            imported += 1
        except StandardError:
            print("Fail at line {0}".format(lineno))
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed as well.
            traceback.print_exc(file=sys.stdout)
    logging.info("Import web pages::{0} out of {1} imported.".format(imported, lineno))
    logging.info("------------------------------------------")
Ejemplo n.º 5
0
def sparsitysetup(nums):
    """Plot ranking results for different amounts of text per place.

    The place ids are read from 'chicago10.lst' (one per line). For every
    ``num`` in `nums` and every place, up to 160 random tweet texts are
    drawn from table `sample`:
      * the first ``num`` texts build the language model of the place;
      * each text in positions 150..159 becomes a one-tweet query model.
    The queries are ranked against the place models with ``ranke``, scored
    with ``batcheval``, and one curve labelled 't=<num>' is plotted per
    ``num``. The figure is shown at the end; nothing is returned.

    nums -- iterable with the number of texts to use per place model
    """
    lsts = linestyles()
    # The place list does not depend on num, so it is read only once
    with open('chicago10.lst') as fin:
        places = [p.strip() for p in fin]

    for num in nums:
        lmplc = dict()
        lmtwt = Dataset()
        for pid in places:
            cur = CONN_POOL.get_cur(GEOTWEET)
            # The place id is passed as a parameter, not pasted into the SQL
            cur.execute('select text from sample'
                        ' where place_id = %s order by rand() limit 160',
                        (pid,))
            text = [row['text'] for row in cur]
            # NOTE: with num > 150 this slice overlaps the queries below
            lmplc[pid] = lmfromtext(text[:num])
            for txt in text[150:160]:
                lmtwt.append({'pid': pid, 'lm': lmfromtext([txt])})
        ranks = [ranke(lmplc, item['lm']) for item in lmtwt]
        gch = batcheval(lmtwt['pid'], len(places), ranks)
        plt.plot(gch['pos'], gch['rate'],
                 next(lsts), label='t={0}'.format(num))
    plt.xlabel('First $n$ places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
Ejemplo n.º 6
0
def confusionmatrix(places):
    """ Show the matrix of confusion between LMs by KL-divergence
    """
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        cur.execute('select text from sample' \
                ' where place_id = \'{0}\' order by rand() limit {1}'.format(pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        lmtwt2[pid] = lmfromtext(text[81:160])
    confmat = list()
    for lm_i in places:
        confmat.append(
            [kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places])

    selfavg = sum([confmat[i][i] for i in range(len(places))])
    mutavg = sum([sum(confmat[i]) for i in range(len(places))]) - selfavg
    selfavg /= float(len(places))
    mutavg /= float(len(places) * len(places) - len(places))
    print selfavg, mutavg

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    plt.yticks(range(len(places)), \
            ['{0}: {1}'.format(place_name(places[i]), i) for i in range(len(places))])
    plt.xticks(range(len(places)))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
Ejemplo n.º 7
0
def confusionmatrix(places):
    """ Show the matrix of confusion between LMs by KL-divergence
    """
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        cur.execute('select text from sample' \
                ' where place_id = \'{0}\' order by rand() limit {1}'.format(pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        lmtwt2[pid] = lmfromtext(text[81:160])
    confmat = list()
    for lm_i in places:
        confmat.append([kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places])

    selfavg = sum([confmat[i][i] for i in range(len(places))])
    mutavg = sum([sum(confmat[i]) for i in range(len(places))]) - selfavg
    selfavg /= float(len(places))
    mutavg /= float(len(places)*len(places) - len(places))
    print selfavg, mutavg


    plt.imshow(np.array(confmat), cmap = cm.gray, interpolation='nearest')
    plt.yticks(range(len(places)), \
            ['{0}: {1}'.format(place_name(places[i]), i) for i in range(len(places))])
    plt.xticks(range(len(places)))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
Ejemplo n.º 8
0
def tweet_count(dbconf, table):
    """Print and return the number of rows in `table`.

    The table name is interpolated directly into the SQL text, so it has to
    come from trusted code.
    """
    cursor = CONN_POOL.get_cur(dbconf)
    sql = 'select count(*) as cnt from {0}'.format(table)
    cursor.execute(sql)
    result = cursor.fetchone()
    print('Count:{0}'.format(result['cnt']))
    return result['cnt']
Ejemplo n.º 9
0
def qloadrows(config, query):
    """Run `query` and hand the fetched rows to a fresh Dataset.

    The query text and the cursor's row count are echoed to stdout.
    """
    cursor = CONN_POOL.get_cur(config)
    print(query)
    cursor.execute(query)
    print('Count: {0}'.format(cursor.rowcount))
    loaded = Dataset()
    rows = [row for row in cursor]
    # NOTE(review): the caller receives whatever Dataset.extend() returns;
    # confirm that it hands back the dataset rather than None.
    return loaded.extend(rows)
Ejemplo n.º 10
0
def place_name(pid, dbconf=GEOTWEET):
    """Return the name stored in table `place` for the place id `pid`.

    `pid` is returned unchanged when it does not match ``IDPTN`` or when
    no row with that id exists.
    """
    if IDPTN.match(pid) is None:
        return pid
    cur = CONN_POOL.get_cur(dbconf)
    cur.execute("select name from place where id=%s", (pid, ))
    found = cur.fetchone()
    # fetchone() gives None when nothing matched; fall back to the raw id
    # instead of failing on None['name'].
    return pid if found is None else found['name']
Ejemplo n.º 11
0
def qloadrows(config, query):
    """Execute `query` and collect its rows through ``Dataset().extend``.

    Both the query text and the row count reported by the cursor are
    printed.
    """
    db_cursor = CONN_POOL.get_cur(config)
    print(query)
    db_cursor.execute(query)
    count_line = 'Count: {0}'.format(db_cursor.rowcount)
    print(count_line)
    dataset = Dataset()
    fetched = [record for record in db_cursor]
    # NOTE(review): the result is whatever Dataset.extend() returns; verify
    # that this is the dataset itself and not None.
    return dataset.extend(fetched)
Ejemplo n.º 12
0
def admin_dist(dbconf, pid):
    """Get the distribution of tweets over the direct children of `pid`.

    Returns a dict mapping each place id whose ``superior_id`` is `pid` to
      'poi' -- the number of places returned by ``sub_place`` for it
      'cnt' -- the number of `sample` rows located at any of those places
               (0 when ``sub_place`` returns nothing, so the key is always
               present)
    """
    cur = CONN_POOL.get_cur(dbconf)
    # Parameters are wrapped in a tuple: the DB-API defines the second
    # argument of execute() as a sequence or mapping.
    cur.execute("select id from place where superior_id=%s", (pid,))
    dist = dict()
    for row in cur:
        plcs = sub_place(row['id'], dbconf)
        curx = CONN_POOL.get_cur(dbconf)
        total = 0
        for plc in plcs:
            curx.execute(
                "select count(id) as cnt from sample where place_id=%s",
                (plc,))
            total += curx.fetchone()['cnt']
        dist[row['id']] = {'poi': len(plcs), 'cnt': total}
    return dist
Ejemplo n.º 13
0
def im_place(srcs):
    """Import places from file to database.

    Every input line is parsed as a JSON object and inserted into table
    `place`; the raw line is stored in `place_json`. For places whose
    ``place_type`` is not "country", the coordinates come from the first
    point of ``bounding_box`` and the superior place from the first entry of
    ``contained_within``. Duplicate ids are reported and skipped; any other
    failure is logged with its line number and the import continues.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    cur = CONN_POOL.get_cur(GEOTWEET)

    k, i = 0, 0
    # fileinput.input() is a module-level function; FileInput objects have no
    # input() method, so the files are opened the same way as in im_tweet.
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        # Count the line before parsing so a JSON error reports the right one
        k += 1
        try:
            tjson = json.loads(line)
            lat = 0
            lng = 0
            if tjson["place_type"] != "country":
                corner = tjson["bounding_box"]["coordinates"][0][0]
                lat = corner[1]
                lng = corner[0]
                superior = tjson["contained_within"][0]
                item = (
                    tjson["id"],
                    tjson["name"],
                    tjson["place_type"],
                    superior["id"],
                    superior["name"],
                    superior["place_type"],
                    lat,
                    lng,
                    tjson["country_code"],
                )
            else:
                # Six NULLs (type, superior_id, superior_name, superior_type,
                # lat, lng) so the tuple matches the nine column placeholders.
                item = (
                    tjson["id"],
                    tjson["name"],
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    tjson["country_code"],
                )

            # The WKT point travels as a parameter instead of being formatted
            # into the SQL text.
            point = "Point({0} {1})".format(lat, lng)
            cur.execute(
                "INSERT INTO place ("
                "`id`, "
                "`name`, "
                "`type`, "
                "`superior_id`, "
                "`superior_name`, "
                "`superior_type`, "
                "`lat`, "
                "`lng`, "
                "`country`, "
                "`geo`)"
                "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,"
                "GeomFromText(%s))",
                item + (point,),
            )
            cur.execute("INSERT INTO place_json (id, json) VALUES(%s,%s)", (tjson["id"], line))
            i += 1
        except _mysql_exceptions.IntegrityError:
            print("Import Places::Place ID {0} ignored for duplication.".format(tjson["id"]))
        except StandardError:
            logging.error("Fail at line {0}".format(k))

    logging.info("Import Places::{0} out of {1} imported.".format(i, k))
    logging.info("------------------------------------------")
Ejemplo n.º 14
0
def filter_tweet():
    """Copy the tweets that still yield tokens from `tweet` into `sample`.

    The filtering itself (getting rid of square game text) is delegated to
    ``get_tokens``: a tweet is copied only when ``get_tokens`` returns a
    non-empty result for its text. The number of copied tweets is logged.
    """
    source = CONN_POOL.get_cur(GEOTWEET)
    target = CONN_POOL.get_cur(GEOTWEET)

    source.execute("select id, text from tweet")
    total, kept = 0, 0
    for tweet in source:
        total += 1
        if len(get_tokens(tweet["text"])) > 0:
            # Parameters are wrapped in a tuple: the DB-API defines the
            # second argument of execute() as a sequence or mapping.
            target.execute(
                "insert into `sample` \
                    select * from `tweet`\
                    where `tweet`.`id` = %s",
                (tweet["id"],),
            )
            kept += 1
    logging.info("{0} out of {1} tweets are transferred".format(kept, total))
Ejemplo n.º 15
0
def sub_place(pid, dbconf):
    """Return the set of ids of the places contained within `pid`.

    Children are found through ``place.superior_id`` and followed
    recursively, so the set holds contained places at every depth.
    """
    cur = CONN_POOL.get_cur(dbconf)
    # Parameters are wrapped in a tuple: the DB-API defines the second
    # argument of execute() as a sequence or mapping.
    cur.execute("select id from place where superior_id=%s", (pid,))
    found = set()
    for child in cur:
        found.add(child['id'])
        found.update(sub_place(child['id'], dbconf))
    return found
Ejemplo n.º 16
0
def im_place_genre(srcs):
    """Store the Foursquare response groups of places in `foursquare_json`.

    Every input line is parsed as a JSON object; its ``t_place_id`` and the
    JSON dump of ``response.groups`` are inserted as one row. Duplicate ids
    are reported and skipped, other failures are logged with their line
    number, and the connection is committed once all lines are processed.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    cur = CONN_POOL.get_cur(GEOTWEET)
    k = 0

    # fileinput.input() is a module-level function; FileInput objects have no
    # input() method.
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        k += 1
        try:
            rst = json.loads(line)
            pid = rst["t_place_id"]
            groups = json.dumps(rst["response"]["groups"])
            cur.execute(
                r"insert into foursquare_json(id, json) " "values(%s,%s)",
                (str(pid), groups),
            )
        except _mysql_exceptions.IntegrityError:
            print("Import place genre::Place {0} ignored for duplication.".format(rst["t_place_id"]))
        except StandardError as err:
            # Format the exception itself: .message is empty whenever the
            # exception carries more than one argument.
            logging.error("Fail at line {0}, {1}".format(k, err))
    CONN_POOL.get_conn(GEOTWEET).commit()
Ejemplo n.º 17
0
def foursq_count(dbconf, table):
    """Print and return how many rows of `table` mention http://4sq.com.

    The table name is interpolated directly into the SQL text, so it has to
    come from trusted code.
    """
    # The dot is escaped so that only a literal '.' matches
    foursq_pattern = re.compile(r'http://4sq\.com')
    cur = CONN_POOL.get_cur(dbconf)
    cur.execute('select text from {0}'.format(table))
    k = 0
    for row in cur:
        if foursq_pattern.search(row['text']) is not None:
            k += 1
    print('Foursqure.com tweet count:{0}'.format(k))
    return k
Ejemplo n.º 18
0
def im_place_genre(srcs):
    """Store the Foursquare response groups of places in `foursquare_json`.

    Every input line is parsed as a JSON object; its ``t_place_id`` and the
    JSON dump of ``response.groups`` are inserted as one row. Duplicate ids
    are reported and skipped, other failures are logged with their line
    number, and the connection is committed once all lines are processed.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    cur = CONN_POOL.get_cur(GEOTWEET)
    k = 0

    # fileinput.input() is a module-level function; FileInput objects have no
    # input() method.
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        k += 1
        try:
            rst = json.loads(line)
            pid = rst['t_place_id']
            cur.execute(r'insert into foursquare_json(id, json) '
                        'values(%s,%s)',
                        (str(pid), json.dumps(rst['response']['groups'])))
        except _mysql_exceptions.IntegrityError:
            print('Import place genre::Place {0} ignored for duplication.'
                  .format(rst['t_place_id']))
        except StandardError as err:
            # Format the exception itself: .message is empty whenever the
            # exception carries more than one argument.
            logging.error('Fail at line {0}, {1}'.format(k, err))
    CONN_POOL.get_conn(GEOTWEET).commit()
Ejemplo n.º 19
0
def im_tweet(srcs):
    """Import tweets from file to database.

    Every input line is parsed as a JSON tweet. Tweets for which
    ``get_tokens`` returns a non-empty result are inserted into table
    `sample`, with the coordinates taken from the first point of the place's
    ``bounding_box``. Duplicate ids are reported and skipped; any other
    failure is reported with its line number and the import continues.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    cur = CONN_POOL.get_cur(GEOTWEET)
    i = 0
    k = 0
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        # Count the line before parsing so a failure reports the right one
        k += 1
        try:
            tjson = json.loads(line)
            corner = tjson['place']['bounding_box']['coordinates'][0][0]
            lat = corner[1]
            lng = corner[0]
            # created_at looks like: Wed Apr 14 18:51:32 +0000 2010
            timestru = time.strptime(tjson['created_at'],
                                     '%a %b %d %H:%M:%S +0000 %Y')
            timex = time.strftime('%Y-%m-%d %H:%M:%S', timestru)
            # The WKT point travels as a parameter instead of being formatted
            # into the SQL text.
            item = (tjson['id'],
                    tjson['place']['id'],
                    tjson['user']['id'],
                    tjson['text'],
                    lat,
                    lng,
                    'POINT({0} {1})'.format(lat, lng),
                    timex)

            if len(get_tokens(tjson['text'])) > 0:
                cur.execute('INSERT INTO sample ('
                            'id, '
                            'place_id, '
                            'user_id, '
                            'text, '
                            'lat, '
                            'lng, '
                            'geo, '
                            'created_at) '
                            'VALUES(%s,%s,%s,%s,%s,%s,'
                            'GeomFromText(%s),%s)', item)
                # Only tweets that were actually inserted count as imported
                i += 1
        except _mysql_exceptions.IntegrityError:
            print('Import Tweets::Tweet ID {0} ignored for duplication.'
                  .format(tjson['id']))
        except StandardError:
            print('Fail at line {0}'.format(k))
    logging.info('Import Tweet::{0} out of {1} imported.'.format(i, k))
    logging.info('------------------------------------------')
Ejemplo n.º 20
0
def loadrows(config, cols, wheres=None, table='sample', other=''):
    """Load rows of `table` into a Dataset of DataItems.

    config -- database configuration passed to ``CONN_POOL.get_cur``
    cols   -- sequence of column names, or the string '*' for every column
    wheres -- optional sequence of SQL conditions, joined with AND
    table  -- name of the table to read
    other  -- SQL text appended after the WHERE clause

    All pieces are concatenated into the SQL text without escaping, so they
    have to come from trusted code. The query and the row count are printed.
    """
    select_all = cols == '*'
    query = 'SELECT ' + \
            ('*' if select_all else ', '.join(cols)) \
            + ' FROM ' + table + \
            ((' WHERE ' + ' AND '.join(wheres)) if wheres else '') \
            + ' ' + other
    cur = CONN_POOL.get_cur(config)
    print(query)
    cur.execute(query)
    res = Dataset()
    for row in cur:
        twt = DataItem()
        # With '*' there is no column list to walk (iterating the string
        # would look up row['*']), so every key of the row is copied.
        # Assumes rows are dict-like, as the row[key] access already does.
        for key in (row.keys() if select_all else cols):
            twt[key] = row[key]
        res.append(twt)
    print('Count: {0}'.format(cur.rowcount))
    return res
Ejemplo n.º 21
0
def loadrows(config, cols, wheres=None, table='sample', other=''):
    """Load rows of `table` into a Dataset of DataItems.

    config -- database configuration passed to ``CONN_POOL.get_cur``
    cols   -- sequence of column names, or the string '*' for every column
    wheres -- optional sequence of SQL conditions, joined with AND
    table  -- name of the table to read
    other  -- SQL text appended after the WHERE clause

    All pieces are concatenated into the SQL text without escaping, so they
    have to come from trusted code. The query and the row count are printed.
    """
    every_column = cols == '*'
    column_sql = '*' if every_column else ', '.join(cols)
    where_sql = (' WHERE ' + ' AND '.join(wheres)) if wheres else ''
    query = 'SELECT ' + column_sql + ' FROM ' + table + where_sql \
            + ' ' + other
    cur = CONN_POOL.get_cur(config)
    print(query)
    cur.execute(query)
    res = Dataset()
    for row in cur:
        # With '*' there is no column list to walk (iterating the string
        # would look up row['*']), so every key of the row is copied.
        # Assumes rows are dict-like, as the row[key] access already does.
        keys = row.keys() if every_column else cols
        twt = DataItem()
        for key in keys:
            twt[key] = row[key]
        res.append(twt)
    print('Count: {0}'.format(cur.rowcount))
    return res
Ejemplo n.º 22
0
def im_tweet(srcs):
    """Import tweets from file to database.

    Every input line is parsed as a JSON tweet. Tweets for which
    ``get_tokens`` returns a non-empty result are inserted into table
    `sample`, with the coordinates taken from the first point of the place's
    ``bounding_box``. Duplicate ids are reported and skipped; any other
    failure is reported with its line number and the import continues.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    cur = CONN_POOL.get_cur(GEOTWEET)
    i = 0
    k = 0
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        # Count the line before parsing so a failure reports the right one
        k += 1
        try:
            tjson = json.loads(line)
            corner = tjson["place"]["bounding_box"]["coordinates"][0][0]
            lat = corner[1]
            lng = corner[0]
            # created_at looks like: Wed Apr 14 18:51:32 +0000 2010
            timestru = time.strptime(tjson["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
            timex = time.strftime("%Y-%m-%d %H:%M:%S", timestru)
            # The WKT point travels as a parameter instead of being formatted
            # into the SQL text.
            item = (
                tjson["id"],
                tjson["place"]["id"],
                tjson["user"]["id"],
                tjson["text"],
                lat,
                lng,
                "POINT({0} {1})".format(lat, lng),
                timex,
            )

            if len(get_tokens(tjson["text"])) > 0:
                cur.execute(
                    "INSERT INTO sample ("
                    "id, "
                    "place_id, "
                    "user_id, "
                    "text, "
                    "lat, "
                    "lng, "
                    "geo, "
                    "created_at) "
                    "VALUES(%s,%s,%s,%s,%s,%s,"
                    "GeomFromText(%s),%s)",
                    item,
                )
                # Only tweets that were actually inserted count as imported
                i += 1
        except _mysql_exceptions.IntegrityError:
            print("Import Tweets::Tweet ID {0} ignored for duplication.".format(tjson["id"]))
        except StandardError:
            print("Fail at line {0}".format(k))
    logging.info("Import Tweet::{0} out of {1} imported.".format(i, k))
    logging.info("------------------------------------------")
Ejemplo n.º 23
0
def sparsitysetup(nums):
    """Plot ranking results for different amounts of text per place.

    The place ids are read from 'chicago10.lst' (one per line). For every
    ``num`` in `nums` and every place, up to 160 random tweet texts are
    drawn from table `sample`:
      * the first ``num`` texts build the language model of the place;
      * each text in positions 150..159 becomes a one-tweet query model.
    The queries are ranked against the place models with ``ranke``, scored
    with ``batcheval``, and one curve labelled 't=<num>' is plotted per
    ``num``. The figure is shown at the end; nothing is returned.

    nums -- iterable with the number of texts to use per place model
    """
    lsts = linestyles()
    # The place list does not depend on num, so it is read only once
    with open('chicago10.lst') as fin:
        places = [p.strip() for p in fin]

    for num in nums:
        lmplc = dict()
        lmtwt = Dataset()
        for pid in places:
            cur = CONN_POOL.get_cur(GEOTWEET)
            # The place id is passed as a parameter, not pasted into the SQL
            cur.execute('select text from sample'
                        ' where place_id = %s order by rand() limit 160',
                        (pid,))
            text = [row['text'] for row in cur]
            # NOTE: with num > 150 this slice overlaps the queries below
            lmplc[pid] = lmfromtext(text[:num])
            for txt in text[150:160]:
                lmtwt.append({
                    'pid': pid,
                    'lm': lmfromtext([txt]),
                })
        ranks = [ranke(lmplc, item['lm']) for item in lmtwt]
        gch = batcheval(lmtwt['pid'], len(places), ranks)
        plt.plot(gch['pos'],
                 gch['rate'],
                 next(lsts),
                 label='t={0}'.format(num))
    plt.xlabel('First $n$ places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
Ejemplo n.º 24
0
def im_place(srcs):
    """Import places from file to database.

    Every input line is parsed as a JSON object and inserted into table
    `place`; the raw line is stored in `place_json`. For places whose
    ``place_type`` is not 'country', the coordinates come from the first
    point of ``bounding_box`` and the superior place from the first entry of
    ``contained_within``. Duplicate ids are reported and skipped; any other
    failure is logged with its line number and the import continues.

    srcs -- file name(s) passed to ``fileinput.input``; files are opened
            through ``fileinput.hook_compressed``
    """
    cur = CONN_POOL.get_cur(GEOTWEET)

    k, i = 0, 0
    # fileinput.input() is a module-level function; FileInput objects have no
    # input() method, so the files are opened the same way as in im_tweet.
    for line in fileinput.input(srcs, openhook=fileinput.hook_compressed):
        # Count the line before parsing so a JSON error reports the right one
        k += 1
        try:
            tjson = json.loads(line)
            lat = 0
            lng = 0
            if tjson['place_type'] != 'country':
                corner = tjson['bounding_box']['coordinates'][0][0]
                lat = corner[1]
                lng = corner[0]
                superior = tjson['contained_within'][0]
                item = (tjson['id'],
                        tjson['name'],
                        tjson['place_type'],
                        superior['id'],
                        superior['name'],
                        superior['place_type'],
                        lat,
                        lng,
                        tjson['country_code'])
            else:
                # Six NULLs (type, superior_id, superior_name, superior_type,
                # lat, lng) so the tuple matches the nine column placeholders.
                item = (tjson['id'],
                        tjson['name'],
                        None,
                        None,
                        None,
                        None,
                        None,
                        None,
                        tjson['country_code'])

            # The WKT point travels as a parameter instead of being formatted
            # into the SQL text.
            point = 'Point({0} {1})'.format(lat, lng)
            cur.execute('INSERT INTO place ('
                        '`id`, '
                        '`name`, '
                        '`type`, '
                        '`superior_id`, '
                        '`superior_name`, '
                        '`superior_type`, '
                        '`lat`, '
                        '`lng`, '
                        '`country`, '
                        '`geo`)'
                        'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,'
                        'GeomFromText(%s))',
                        item + (point,))
            cur.execute('INSERT INTO place_json (id, json) VALUES(%s,%s)',
                        (tjson['id'], line))
            i += 1
        except _mysql_exceptions.IntegrityError:
            print('Import Places::Place ID {0} ignored for duplication.'.format(
                tjson['id']))
        except StandardError:
            logging.error('Fail at line {0}'.format(k))

    logging.info('Import Places::{0} out of {1} imported.'.format(i, k))
    logging.info('------------------------------------------')