Code example #1
# Imports assumed by this snippet. The later snippets use the project's fuller
# header (import csv, datetime, gzip, json, os, sys; numpy as np; pandas as pd;
# dateutil.relativedelta) plus project-local modules such as database_io,
# individual_mobility_network, feature_extractor and the segmentation helpers.
from datetime import datetime

import pandas as pd
import pymongo

import database_io  # project-local


def main():

    country = 'uk'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    drop_table = True

    max_speed = 0.07       # presumably km/s (~250 km/h), used to drop impossible jumps
    space_treshold = 0.05  # presumably km (i.e. 50 m); the 'treshold' spelling
                           # matches the gps2trajevntcrash keyword arguments, so it is kept
    time_treshold = 1200   # seconds: split the trajectory when the time gap
                           # between consecutive points exceeds this value

    users = pd.read_csv('../../dataset/users_%s.csv' % country,
                        header=None)[0].values.tolist()
    client = pymongo.MongoClient('mongodb://username@ipaddress:port/')
    db = client['dataset2']
    con = database_io.get_connection()
    print(datetime.now(), 'Building trajectories')
    for i, uid in enumerate(users):
        traj_data = db.POSITIONS.find({
            'T&K_VOUCHER_ID': uid
        }).sort('TIMESTAMP_LOCAL', pymongo.ASCENDING)
        evnt_data = db.EVENTS.find({
            'T&K_VOUCHER_ID': uid
        }).sort('TIMESTAMP_LOCAL', pymongo.ASCENDING)
        crash_data = db.CRASH.find({
            'T&K_VOUCHER_ID': uid
        }).sort('TIMESTAMP_LOCAL', pymongo.ASCENDING)

        print(
            datetime.now(), 'Processing user %s, %s of %s (%.2f)' %
            (uid, i, len(users), 100.0 * i / len(users)))
        gps2trajevntcrash(traj_data,
                          evnt_data,
                          crash_data,
                          traj_table,
                          evnt_table,
                          crash_table,
                          con,
                          drop_table=drop_table,
                          max_speed=max_speed,
                          space_treshold=space_treshold,
                          time_treshold=time_treshold)
        drop_table = False

    print(datetime.now(), 'Process ended.')
Code example #2
def imn_extract(filename,
                path,
                type_user,
                traj_table,
                evnt_table,
                min_traj_nbr,
                min_length,
                min_duration,
                area,
                overwrite=False):
    output_filename = path + filename

    con = database_io.get_connection()
    cur = con.cursor()

    #users_list = [100225,101127,100742,100747,100690,100578,1003,100191,100192,100193,321463]
    users_list = [100619, 100554]
    users_list = sorted(users_list)
    nbr_users = len(users_list)

    print("user ids before checking :")
    print(nbr_users, len(users_list))

    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print("user ids after checking :")
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        if i % 1 == 0:  # always true; kept as an adjustable throttle for progress output (same pattern below)
            print(
                datetime.datetime.now(),
                '%s %s %s [%s/%s] - %.2f' % (traj_table, area, type_user, i,
                                             nbr_users, i / nbr_users * 100.0))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None

        #imh['trajectories']=dict(list(islice(imh['trajectories'].items(), 200)))
        if len(imh['trajectories']) < min_traj_nbr:
            print('len(trajectories) < min_traj_nbr:',
                  len(imh['trajectories']), min_traj_nbr)
            continue

        main_imh = imh['trajectories']
        jan_feb_tid = []
        march_april_tid = []

        for tid, t in imh['trajectories'].items():
            start_time = str(t.start_time())
            if ('2017-01' in start_time) or ('2017-02' in start_time):
                jan_feb_tid.append(tid)

            if ('2017-03' in start_time) or ('2017-04' in start_time):
                march_april_tid.append(tid)

        imh['trajectories'] = {x: imh['trajectories'][x] for x in jan_feb_tid}
        imn1 = individual_mobility_network.build_imn(imh,
                                                     reg_loc=True,
                                                     events=events,
                                                     verbose=False)
        period_imn1 = {"01-02": imn1}

        imh['trajectories'] = {x: main_imh[x] for x in march_april_tid}
        imn2 = individual_mobility_network.build_imn(imh,
                                                     reg_loc=True,
                                                     events=events,
                                                     verbose=False)
        period_imn2 = {"03-04": imn2}

        customer_obj = {'uid': uid}
        period_imn1.update(period_imn2)
        customer_obj.update(period_imn1)

        json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                       default=agenda_converter)
        json_bytes = json_str.encode('utf-8')
        with gzip.GzipFile(output_filename, 'a') as fout:
            fout.write(json_bytes)
    print("done")
    cur.close()
    con.close()
Code example #3
def imn_extract_for_one_month(filename,
                              path,
                              type_user,
                              traj_table,
                              evnt_table,
                              min_traj_nbr,
                              min_length,
                              min_duration,
                              area,
                              overwrite=False):
    output_filename = path + filename
    con = database_io.get_connection()
    cur = con.cursor()
    users_list = find_user_list(cur, traj_table)
    nbr_users = len(users_list)
    print("user ids before checking :")
    print(nbr_users, len(users_list))
    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print("user ids after checking :")
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        try:
            if i % 1 == 0:
                print(
                    datetime.datetime.now(), '%s %s %s [%s/%s] - %.2f' %
                    (traj_table, area, type_user, i, nbr_users,
                     i / nbr_users * 100.0))

            imh = database_io.load_individual_mobility_history(
                cur, uid, traj_table, min_length, min_duration)
            events = database_io.load_individual_event_history(
                cur, uid, evnt_table) if evnt_table is not None else None
            if len(imh['trajectories']) < min_traj_nbr:
                print('len(trajectories) < min_traj_nbr:',
                      len(imh['trajectories']), min_traj_nbr)
                continue

            main_imh = imh['trajectories']
            jan_tid = []
            for tid, t in imh['trajectories'].items():
                start_time = str(t.start_time())
                if '2017-01' in start_time:
                    jan_tid.append(tid)

            imh['trajectories'] = {x: imh['trajectories'][x] for x in jan_tid}
            imn1 = individual_mobility_network.build_imn(imh,
                                                         reg_loc=True,
                                                         events=events,
                                                         verbose=False)
            period_imn1 = {"01": imn1}
            customer_obj = {'uid': uid}
            customer_obj.update(period_imn1)
            json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                           default=agenda_converter)
            json_bytes = json_str.encode('utf-8')
            with gzip.GzipFile(output_filename, 'a') as fout:
                fout.write(json_bytes)
        except TypeError:
            print('TypeError while processing uid', uid)
            continue
    print("done")
    cur.close()
    con.close()
Code example #4
def imn_extract_all_year(filename,
                         path,
                         type_user,
                         traj_table,
                         evnt_table,
                         min_traj_nbr,
                         min_length,
                         min_duration,
                         area,
                         overwrite=False):
    output_filename = path + filename

    con = database_io.get_connection()
    cur = con.cursor()
    #users_list=find_user_list(cur,traj_table):
    #users_list = [100225,101127,100742,100747,100690,100578,1003,100191,100192,100193,318819,100619,100554,100498]
    #users_list = [100843,100836,100827,100795,100747,100717,100681,100669,101293,101194,101091]
    users_list = [7925]
    users_list = sorted(users_list)
    nbr_users = len(users_list)

    print("user ids before checking :")
    print(nbr_users, len(users_list))

    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]
    print("user ids after checking :")
    print(nbr_users, len(users_list))

    for i, uid in enumerate(users_list):
        try:
            if i % 1 == 0:
                print(
                    datetime.datetime.now(), '%s %s %s [%s/%s] - %.2f' %
                    (traj_table, area, type_user, i, nbr_users,
                     i / nbr_users * 100.0))

            imh = database_io.load_individual_mobility_history(
                cur, uid, traj_table, min_length, min_duration)
            events = database_io.load_individual_event_history(
                cur, uid, evnt_table) if evnt_table is not None else None

            #imh['trajectories']=dict(list(islice(imh['trajectories'].items(), 200)))
            if len(imh['trajectories']) < min_traj_nbr:
                print('len(trajectories) < min_traj_nbr:',
                      len(imh['trajectories']), min_traj_nbr)
                continue
            imn = individual_mobility_network.build_imn(imh,
                                                        reg_loc=True,
                                                        events=events,
                                                        verbose=False)
            customer_obj = {'uid': uid}
            customer_obj.update(imn)

            json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                           default=agenda_converter)
            json_bytes = json_str.encode('utf-8')
            with gzip.GzipFile(output_filename, 'a') as fout:
                fout.write(json_bytes)
        except TypeError:
            print('TypeError while processing uid', uid)
            continue
    print("done")
    cur.close()
    con.close()
Code example #5
def main():

    area = sys.argv[1]  # 'rome' 'tuscany' 'london'
    type_user = sys.argv[2]  # 'crash' 'nocrash'
    overwrite = int(sys.argv[3])
    country = 'uk' if area == 'london' else 'italy'

    min_length = 1.0
    min_duration = 60.0

    print(datetime.datetime.now(), 'Crash Prediction - Train Test Partitioner')
    if not overwrite:
        print(datetime.datetime.now(), '(restart)')

    path = './'
    path_imn = path + 'imn_new/'
    path_dataset = path + 'dataset/'
    path_traintest = path + 'traintest/'
    path_quadtree = path + 'quadtree/'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    if area == 'london' and type_user == 'nocrash':
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area, 'all')
        users_filename_crash = path_dataset + '%s_%s_users_list.csv' % (
            area, 'crash')
    else:
        users_filename = path_dataset + '%s_%s_users_list.csv' % (area,
                                                                  type_user)
        users_filename_crash = None

    users_list = pd.read_csv(users_filename).values[:, 0].tolist()
    users_list = sorted(users_list)

    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(
            users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]

    nbr_users = len(users_list)

    print(datetime.datetime.now(), 'Reading quadtree')
    quadtree_poi_filename = path_quadtree + '%s_personal_osm_poi_lv17.json.gz' % area
    fout = gzip.GzipFile(quadtree_poi_filename, 'r')
    quadtree = json.loads(fout.readline())
    fout.close()

    print(datetime.datetime.now(), 'Reading quadtree features')
    quadtree_features_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    fout = gzip.GzipFile(quadtree_features_filename, 'r')
    quadtrees_features_str = json.loads(fout.readline())
    quadtrees_features = {int(k): v for k, v in quadtrees_features_str.items()}
    fout.close()

    processed_users = set()
    if overwrite:
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.exists(output_filename):
                os.remove(output_filename)
    else:
        for index in range(0, 7):
            output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                area, type_user, index)
            if os.path.isfile(output_filename):
                fout = gzip.GzipFile(output_filename, 'r')
                for row in fout:
                    customer_obj = json.loads(row)
                    processed_users.add(customer_obj['uid'])
                fout.close()

    window = 4
    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00',
                                               '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00',
                                             '%Y-%m-%d %H:%M:%S')

    print(datetime.datetime.now(), 'Generating month boundaries')
    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um]
                  for lm, um in zip(months[:-window], months[window:])]
    training_months = list()
    test_months = list()
    for i in range(len(boundaries) - 1):
        training_months.append(boundaries[i])
        test_months.append(boundaries[i + 1])

    index = 0
    tr_data_map = dict()
    ts_data_map = dict()
    for tr_months, ts_months in zip(training_months, test_months):
        tr_data_map[tuple(tr_months)] = index
        ts_data_map[tuple(ts_months)] = index
        index += 1

    print(datetime.datetime.now(), 'Initializing quadtree features')
    tr_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in tr_data_map.items():
            if lu[0].month <= m < lu[1].month:
                if index not in tr_quadtree_features:
                    tr_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in tr_quadtree_features[index]:
                        tr_quadtree_features[index][path] = {
                            'nbr_traj_start': 0,
                            'nbr_traj_stop': 0,
                            'nbr_traj_move': 0,
                            'traj_speed_sum': 0,
                            'traj_speed_count': 0,
                            'nbr_evnt_A': 0,
                            'nbr_evnt_B': 0,
                            'nbr_evnt_C': 0,
                            'nbr_evnt_Q': 0,
                            'nbr_evnt_start': 0,
                            'nbr_evnt_stop': 0,
                            'speed_A_sum': 0,
                            'max_acc_A_sum': 0,
                            'avg_acc_A_sum': 0,
                            'speed_B_sum': 0,
                            'max_acc_B_sum': 0,
                            'avg_acc_B_sum': 0,
                            'speed_C_sum': 0,
                            'max_acc_C_sum': 0,
                            'avg_acc_C_sum': 0,
                            'speed_Q_sum': 0,
                            'max_acc_Q_sum': 0,
                            'avg_acc_Q_sum': 0,
                            'nbr_crash': 0,
                        }
                    for k, v in quadtrees_features[m][path].items():
                        tr_quadtree_features[index][path][k] += v

    ts_quadtree_features = dict()
    for m in quadtrees_features:
        for lu, index in ts_data_map.items():  # test windows (the train block above uses tr_data_map)
            if lu[0].month <= m < lu[1].month:
                if index not in ts_quadtree_features:
                    ts_quadtree_features[index] = dict()
                for path in quadtrees_features[m]:
                    if path not in ts_quadtree_features[index]:
                        ts_quadtree_features[index][path] = {
                            'nbr_traj_start': 0,
                            'nbr_traj_stop': 0,
                            'nbr_traj_move': 0,
                            'traj_speed_sum': 0,
                            'traj_speed_count': 0,
                            'nbr_evnt_A': 0,
                            'nbr_evnt_B': 0,
                            'nbr_evnt_C': 0,
                            'nbr_evnt_Q': 0,
                            'nbr_evnt_start': 0,
                            'nbr_evnt_stop': 0,
                            'speed_A_sum': 0,
                            'max_acc_A_sum': 0,
                            'avg_acc_A_sum': 0,
                            'speed_B_sum': 0,
                            'max_acc_B_sum': 0,
                            'avg_acc_B_sum': 0,
                            'speed_C_sum': 0,
                            'max_acc_C_sum': 0,
                            'avg_acc_C_sum': 0,
                            'speed_Q_sum': 0,
                            'max_acc_Q_sum': 0,
                            'avg_acc_Q_sum': 0,
                            'nbr_crash': 0,
                        }
                    for k, v in quadtrees_features[m][path].items():
                        ts_quadtree_features[index][path][k] += v

    print(datetime.datetime.now(), 'Connecting to database')
    con = database_io.get_connection()
    cur = con.cursor()

    count = 0
    imn_filedata = gzip.GzipFile(
        path_imn + '%s_imn_%s.json.gz' % (area, type_user), 'r')

    print(datetime.datetime.now(),
          'Calculating features and partitioning dataset')
    for row in imn_filedata:
        if len(row) <= 1:
            print('new file started ;-)')
            continue

        user_obj = json.loads(row)
        uid = user_obj['uid']
        count += 1
        if uid in processed_users:
            continue

        if count % 10 == 0:
            print(
                datetime.datetime.now(),
                'train test partition %s %s [%s/%s] - %.2f' %
                (area, type_user, count, nbr_users, 100 * count / nbr_users))

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None
        trajectories = imh['trajectories']

        tr_data = dict()
        ts_data = dict()

        # partitioning imn for train and test
        for imn_months in user_obj:
            if imn_months == 'uid':
                continue

            # print(imn_months)
            m0 = int(imn_months.split('-')[0])
            m1 = int(imn_months.split('-')[1])
            for lu, index in tr_data_map.items():
                if lu[0].month <= m0 < m1 < lu[1].month:
                    if index not in tr_data:
                        tr_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    tr_data[index]['imns'][imn_months] = user_obj[imn_months]

            for lu, index in ts_data_map.items():
                if lu[0].month <= m0 < lu[1].month:
                    if index not in ts_data:
                        ts_data[index] = {
                            'uid': uid,
                            'crash': False,
                            'trajectories': dict(),
                            'imns': dict(),
                            'events': dict(),
                        }
                    ts_data[index]['imns'][imn_months] = user_obj[imn_months]

        # partitioning trajectories for train and test
        for tid, traj in trajectories.items():
            for lu, index in tr_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in tr_data:
                    tr_data[index]['trajectories'][tid] = traj
            for lu, index in ts_data_map.items():
                if lu[0] <= traj.start_time() < lu[1] and index in ts_data:
                    ts_data[index]['trajectories'][tid] = traj

        # partitioning events for train and test
        for eid, evnt in events.items():
            # print(evnt)
            for lu, index in tr_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in tr_data:
                    tr_data[index]['events'][eid] = evnt[0]
            for lu, index in ts_data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in ts_data:
                    ts_data[index]['events'][eid] = evnt[0]

        # label: does the user have a crash in the month right after the window?
        for lu, index in tr_data_map.items():
            if index not in tr_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s' 
                        AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') 
                        AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(
                    lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            tr_data[index]['crash'] = has_crash_next_month

        for lu, index in ts_data_map.items():
            if index not in ts_data:
                continue
            query = """SELECT * FROM %s WHERE uid = '%s' 
                        AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') 
                        AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(
                    lu[1]), str(lu[1] + relativedelta(months=1)))
            cur.execute(query)
            rows = cur.fetchall()
            has_crash_next_month = len(rows) > 0
            ts_data[index]['crash'] = has_crash_next_month

        tr_features, ts_features = feature_extractor.extract_features(
            uid, tr_data, ts_data, quadtree, tr_quadtree_features,
            ts_quadtree_features)

        for index in tr_features:
            if index in ts_features:
                output_filename = path_traintest + '%s_%s_traintest_%s.json.gz' % (
                    area, type_user, index)
                store_obj = {
                    'uid': uid,
                    'train': tr_features[index],
                    'test': ts_features[index]
                }
                feature_extractor.store_features(output_filename, store_obj)

    imn_filedata.close()
Code example #6
def main():

    filename = "exp.csv"
    if os.path.isfile(filename):
        os.remove(filename)

    with open(filename, 'a', newline='') as file:  # newline='' as the csv module docs recommend
        writer = csv.writer(file)
        writer.writerow([
            "initial_threshold", "uid", "len(alltraj)", "M1 len(traj_list)",
            "M1 user_temporal_thresholds", "M1 avg_nbr_points",
            "M1 time_precision", "M1 dist_coverage", "M1 mobility_f1",
            "M2 len(traj_list)", "M2 user_temporal_thresholds",
            "M2 avg_nbr_points", "M2 time_precision", "M2 dist_coverage",
            "M2 mobility_f1", "len(traj_list_random )",
            "avg_nbr_points_random", "time_precision_random",
            "dist_coverage_random", "mobility_random_f1",
            "len(traj_list_random4 )", "avg_nbr_points_random4",
            "time_precision_random4", "dist_coverage_random4",
            "mobility_random4_f1"
        ])

    input_table = 'tak.italy_traj'
    # con = database_io.get_connection()
    # cur = con.cursor()
    # users_list = database_io.extract_users_list('tak.italy_traj', cur)
    # cur.close()
    # con.close()

    users_list = [
        '100006', '100022', '100026', '10008', '100086', '100087', '100088',
        '100090', '100100', '100117'
    ]

    # uid = users_list[0]
    # con = database_io.get_connection()
    # cur = con.cursor()
    # imh = database_io.load_individual_mobility_history(cur, uid, input_table)
    # cur.close()
    # con.close()

    con = database_io.get_connection()
    cur = con.cursor()

    #users_list = database_io.extract_users_list('tak.italy_traj', cur)

    eval_adaptive = list()
    eval_fix1200 = list()
    eval_random = list()
    eval_random2 = list()
    traj_number = list()

    thresholds = [60, 120, 180, 240]

    for t in thresholds:
        for uid in users_list:

            print(uid, input_table)
            imh = database_io.load_individual_mobility_history(
                cur, uid, input_table)

            trajectories = imh['trajectories']
            alltraj = merge_trajectories(trajectories)

            # method 1: adaptive segmentation using a moving median
            traj_list1, user_temporal_thr1 = segment_trajectories_user_adaptive(
                alltraj,
                uid,
                temporal_thr=t,
                spatial_thr=50,
                max_speed=0.07,
                gap=60,
                max_lim=3600 * 48,
                window=15,
                smooth_fun=moving_median,
                min_size=10,
                return_cut=True)

            avg_nbr_points1 = np.mean([len(t) for t in traj_list1])
            print('user_temporal_thr', user_temporal_thr1)
            print('NT %d - ANP %.2f' % (len(traj_list1), avg_nbr_points1))
            time_precision1, dist_coverage1, mobility1_f1 = evalaute_segmentation(
                alltraj, traj_list1, print_report=True)
            eval_adaptive.append(
                (time_precision1, dist_coverage1, mobility1_f1))

            # method 2: plain fixed-threshold segmentation
            traj_list2 = segment_trajectories(alltraj,
                                              uid,
                                              temporal_thr=1200,
                                              spatial_thr=50,
                                              max_speed=0.07)
            avg_nbr_points2 = np.mean([len(t) for t in traj_list2])
            user_temporal_thr2 = 1200
            print('NT %d - ANP %.2f' % (len(traj_list2), avg_nbr_points2))
            time_precision2, dist_coverage2, mobility2_f1 = evalaute_segmentation(
                alltraj, traj_list2, print_report=True)
            eval_fix1200.append(
                (time_precision2, dist_coverage2, mobility2_f1))

            # method 3: random segmentation
            traj_list_random = segment_trajectories_random(alltraj,
                                                           uid,
                                                           nbr_traj=2000)
            avg_nbr_points_random = np.mean([len(t) for t in traj_list_random])
            print('NT %d - ANP %.2f' %
                  (len(traj_list_random), avg_nbr_points_random))
            time_precision_random, dist_coverage_random, mobility_random_f1 = evalaute_segmentation(
                alltraj, traj_list_random, print_report=True)
            eval_random.append((time_precision_random, dist_coverage_random,
                                mobility_random_f1))

            # method 4: random segmentation with nbr_traj_max taken from method 2 (len(traj_list2))
            traj_list_random4 = segment_trajectories_random(
                alltraj, uid, nbr_traj_min=2, nbr_traj_max=len(traj_list2))
            avg_nbr_points_random4 = np.mean(
                [len(t) for t in traj_list_random4])
            print('NT %d - ANP %.2f' %
                  (len(traj_list_random4), avg_nbr_points_random4))
            time_precision_random4, dist_coverage_random4, mobility_random4_f1 = evalaute_segmentation(
                alltraj, traj_list_random4, print_report=True)
            eval_random2.append((time_precision_random4, dist_coverage_random4,
                                 mobility_random4_f1))

            # append this user's results to the CSV

            with open(filename, 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([
                    t, uid,
                    len(alltraj),
                    len(traj_list1), user_temporal_thr1, avg_nbr_points1,
                    time_precision1, dist_coverage1, mobility1_f1,
                    len(traj_list2), user_temporal_thr2, avg_nbr_points2,
                    time_precision2, dist_coverage2, mobility2_f1,
                    len(traj_list_random), avg_nbr_points_random,
                    time_precision_random, dist_coverage_random,
                    mobility_random_f1,
                    len(traj_list_random4), avg_nbr_points_random4,
                    time_precision_random4, dist_coverage_random4,
                    mobility_random4_f1
                ])
Code example #7
def main():
    area = sys.argv[1]

    country = 'uk' if area == 'london' else 'italy'
    overwrite = True
    depth = 16
    store_every = 100

    path = './'
    path_dataset = path + 'dataset/'
    path_quadtree = path + 'quadtree/'

    traj_table = 'tak.%s_traj' % country
    evnt_table = 'tak.%s_evnt' % country
    crash_table = 'tak.%s_crash' % country

    users_filename = path_dataset + '%s_all_users_list.csv' % area
    quadtree_output_filename = path_quadtree + '%s_quadtree_features.json.gz' % area
    quadtrees_features = dict()

    datetime_from = datetime.datetime.strptime('2017-01-01 00:00:00',
                                               '%Y-%m-%d %H:%M:%S')
    datetime_to = datetime.datetime.strptime('2018-01-01 00:00:00',
                                             '%Y-%m-%d %H:%M:%S')

    months = pd.date_range(start=datetime_from, end=datetime_to, freq='MS')
    boundaries = [[lm, um] for lm, um in zip(months[:-1], months[1:])]

    index = 0
    data_map = dict()
    for months in boundaries:
        data_map[tuple(months)] = index
        quadtrees_features[index] = dict()
        index += 1

    users_list = sorted(pd.read_csv(users_filename).values[:, 0].tolist())

    last_processed_user = None
    if os.path.isfile(quadtree_output_filename) and not overwrite:
        fout = gzip.GzipFile(quadtree_output_filename, 'r')
        quadtrees_features_str = json.loads(fout.readline())
        quadtrees_features = {
            int(k): v
            for k, v in quadtrees_features_str.items()
        }
        last_processed_user = json.loads(fout.readline())
        fout.close()

    con = database_io.get_connection()
    cur = con.cursor()

    for i, uid in enumerate(users_list):

        if last_processed_user is not None and uid <= last_processed_user:
            continue

        if i % store_every == 0:
            print(
                datetime.datetime.now(),
                '%s %s %.2f' % (traj_table, area, i / len(users_list) * 100.0))

        trajectories = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length=1.0,
            min_duration=60.0)['trajectories']
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table)

        quadtree_data = dict()

        # partitioning trajectories for train and test
        for tid, traj in trajectories.items():
            for lu, index in data_map.items():
                if lu[0] <= traj.start_time() < lu[1]:
                    if index not in quadtree_data:
                        quadtree_data[index] = {
                            'uid': uid,
                            'crash': None,
                            'trajectories': dict(),
                            'events': dict(),
                        }
                    quadtree_data[index]['trajectories'][tid] = traj

        # partitioning events for train and test
        for eid, evnt in events.items():
            for lu, index in data_map.items():
                if lu[0] <= evnt[0]['date'] < lu[1] and index in quadtree_data:
                    quadtree_data[index]['events'][eid] = evnt[0]

        # label: did the user have a crash within this month window?
        for lu, index in data_map.items():
            if index not in quadtree_data:
                continue
            query = """SELECT lat, lon FROM %s WHERE uid = '%s' 
                        AND date >= TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS') 
                        AND date < TO_TIMESTAMP('%s','YYYY-MM-DD HH24:MI:SS')""" % (
                crash_table, uid, str(lu[0]), str(lu[1]))
            cur.execute(query)
            rows = cur.fetchall()
            if len(rows) > 0:
                quadtree_data[index]['crash'] = {
                    'lat': float(rows[0][0]),
                    'lon': float(rows[0][1])
                }

        quadtrees_features = quadtrees_features_extract(
            quadtrees_features, quadtree_data, depth)

        if i % store_every == 0:
            # record the current user before writing, so a restart resumes
            # exactly after the last user included in the saved features
            last_processed_user = uid
            json_str_quadtree = '%s\n' % json.dumps(quadtrees_features)
            json_bytes_quadtree = json_str_quadtree.encode('utf-8')
            json_str_lpu = '%s\n' % json.dumps(last_processed_user)
            json_bytes_lpu = json_str_lpu.encode('utf-8')
            with gzip.GzipFile(quadtree_output_filename, 'w') as fout:
                fout.write(json_bytes_quadtree)
                fout.write(json_bytes_lpu)
Code example #8
def imn_extract(filename,
                path,
                type_user,
                traj_table,
                evnt_table,
                min_traj_nbr,
                min_length,
                min_duration,
                area,
                overwrite=False,
                users_filename_crash=None):

    output_filename = path + '%s_imn_%s.json.gz' % (area, type_user)

    con = database_io.get_connection()
    cur = con.cursor()

    users_list = pd.read_csv(filename).values[:, 0].tolist()
    users_list = sorted(users_list)

    if users_filename_crash is not None:
        users_list_crash = pd.read_csv(
            users_filename_crash).values[:, 0].tolist()
        users_list_crash = sorted(users_list_crash)
        users_list = [uid for uid in users_list if uid not in users_list_crash]

    nbr_users = len(users_list)
    print(nbr_users, len(users_list))
    if os.path.isfile(output_filename) and not overwrite:
        processed_users = list()
        fout = gzip.GzipFile(output_filename, 'r')
        # count = 0
        for row in fout:
            customer_obj = json.loads(row)
            processed_users.append(customer_obj['uid'])
            # print(customer_obj['uid'])
            # if count == 100:
            #     break
            # count += 1
        fout.close()
        users_list = [uid for uid in users_list if uid not in processed_users]

    print(nbr_users, len(users_list))
    # from_perc = 95
    # to_perc = 100
    for i, uid in enumerate(users_list):

        # if not from_perc < i / len(users_list) * 100.0 <= to_perc:
        #     continue

        if i % 1 == 0:
            print(
                datetime.datetime.now(),
                '%s %s %s [%s/%s] - %.2f' % (traj_table, area, type_user, i,
                                             nbr_users, i / nbr_users * 100.0))
            # print(datetime.datetime.now(), '%s %s %s %.2f' % (
            # traj_table, area, type_user, i / len(users_list) * 100.0), from_perc, to_perc)

        imh = database_io.load_individual_mobility_history(
            cur, uid, traj_table, min_length, min_duration)
        events = database_io.load_individual_event_history(
            cur, uid, evnt_table) if evnt_table is not None else None

        if len(imh['trajectories']) < min_traj_nbr:
            # print('len trajectories]) < min_traj_nbr', len(imh['trajectories']), min_traj_nbr)
            continue

        # print(len(events))
        # print(list(events.keys()))

        wimh_dict = dict()
        wevents_dict = dict()
        for tid, traj in imh['trajectories'].items():
            st = traj.start_time()
            stk_list = start_time_map(st)
            for stk in stk_list:
                if stk is None:
                    continue
                if stk not in wimh_dict:
                    wimh_dict[stk] = {'uid': uid, 'trajectories': dict()}
                    wevents_dict[stk] = dict()
                wimh_dict[stk]['trajectories'][tid] = traj
                if events and tid in events:  # events is None when evnt_table is None
                    wevents_dict[stk][tid] = events[tid]

        customer_obj = {'uid': uid}
        for stk in wimh_dict:
            wimh = wimh_dict[stk]
            wevents = wevents_dict[stk]
            # print(stk, len(wimh['trajectories']), len(wevents))
            if len(wimh['trajectories']) < min_traj_nbr // 12:
                continue

            imn = individual_mobility_network.build_imn(wimh,
                                                        reg_loc=True,
                                                        events=wevents,
                                                        verbose=False)
            customer_obj[stk] = imn

        json_str = '%s\n' % json.dumps(clear_tuples4json(customer_obj),
                                       default=agenda_converter)
        json_bytes = json_str.encode('utf-8')
        with gzip.GzipFile(output_filename, 'a') as fout:
            fout.write(json_bytes)
        # with gzip.GzipFile(output_filename.replace('.json.gz', '_%s_%s.json.gz' % (from_perc, to_perc)), 'a') as fout:
        #     fout.write(json_bytes)

    cur.close()
    con.close()
Code example #9
def main():
    input_table = 'tak.uk_traj'
    path = '/home/agnese/PycharmProjects/TrajectorySegmentation/Risultati/'
    filename = 'LONDON_traj_seg_exp2000.csv'
    # data = pd.read_csv("/home/agnese/PycharmProjects/TrajectorySegmentation/Results/" + "traj_seg_exp100.csv")

    header = ['input_table', 'uid', 'nbr_points', 'avg_sampling_rate', 'std_sampling_rate', 'med_sampling_rate',
              'method', 'nbr_traj', 'avg_nbr_points', 'avg_length', 'avg_duration',
              'avg_sampling_rate_traj', 'std_sampling_rate_traj', 'med_sampling_rate_traj',
              'time_precision', 'dist_coverage', 'mobility_f1', 'temporal_thr']

    processed_users = list()
    if os.path.isfile(path + filename):
        # os.remove(filename)
        df = pd.read_csv(path + filename)
        processed_users = list(df['uid'])
        fileout = open(path + filename, 'a')  # resume: append below the existing header
    else:
        fileout = open(path + filename, 'w')
        fileout.write('%s\n' % (','.join(header)))
        fileout.flush()

    # users_list = ['100006',
    #               '100022',
    #               '100026',
    #               '10008',
    #               '100086',
    #               '100087',
    #               '100088',
    #               '100090',
    #               '100100',
    #               '100117']

    # con = database_io.get_connection()
    # cur = con.cursor()
    # users_list = database_io.extract_users_list('tak.italy_traj', cur)
    # cur.close()
    #
    con = database_io.get_connection()
    cur = con.cursor()
    users_list = pd.read_csv(path + 'london_all_users_list.csv')
    print(users_list.head())
    users_list = users_list['uid'].tolist()

    # return -1

    #users_list = database_io.extract_users_list('tak.uk_traj', cur)
    # users_list = map(int, users_list)
    # print(users_list)

    # users_list = [int(uid) for uid in users_list]
    print(len(users_list))

    count = 0
    nbr_exp = 2000
    #for i, uid in enumerate(users_list):
        #print(datetime.datetime.now(), uid, input_table, '[%s/%s]' % (i, len(users_list)))
        #results = run(cur, uid, input_table)
        #for j, res in enumerate(results):
        #    fileout.write('%s\n' % (','.join([str(r) for r in res])))
        #    f1_dict[res[6]].append(res[-2])
        #    tp_dict[res[6]].append(res[-4])
        #fileout.flush()
    for i, uid in enumerate(users_list):
        print(datetime.datetime.now(), uid, input_table, '[%s/%s]' % (i, len(users_list)))
        if uid in processed_users:
            count += 1
            if count >= nbr_exp:
                break
            continue
        try:
            results = run(cur, uid, input_table)
            for j, res in enumerate(results):
                fileout.write('%s\n' % (','.join([str(r) for r in res])))
                fileout.flush()
        except Exception:
            print(datetime.datetime.now(), uid, input_table, 'Error')
            continue

        count += 1
        if count >= nbr_exp:
            break

    fileout.flush()
    fileout.close()
    cur.close()
    con.close()
Code example #10
def main():
    input_table = 'tak.italy_traj'
    # con = database_io.get_connection()
    # cur = con.cursor()
    # users_list = database_io.extract_users_list('tak.italy_traj', cur)
    # cur.close()
    # con.close()

    users_list = [
        '100006', '100022', '100026', '10008', '100086', '100087', '100088',
        '100090', '100100', '100117'
    ]

    # uid = users_list[0]
    # con = database_io.get_connection()
    # cur = con.cursor()
    # imh = database_io.load_individual_mobility_history(cur, uid, input_table)
    # cur.close()
    # con.close()

    con = database_io.get_connection()
    cur = con.cursor()

    eval_adaptive = list()
    eval_fix1200 = list()
    eval_random = list()

    for uid in users_list:

        print(uid, input_table)
        imh = database_io.load_individual_mobility_history(
            cur, uid, input_table)

        trajectories = imh['trajectories']
        alltraj = merge_trajectories(trajectories)
        traj_list, user_temporal_thr = segment_trajectories_user_adaptive(
            alltraj,
            uid,
            temporal_thr=60,
            spatial_thr=50,
            max_speed=0.07,
            gap=60,
            max_lim=3600 * 48,
            window=15,
            smooth_fun=moving_median,
            min_size=10,
            return_cut=True)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('user_temporal_thr', user_temporal_thr)
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        eval_adaptive.append((time_precision, dist_coverage, mobility_f1))

        print('------')

        traj_list = segment_trajectories(alltraj,
                                         uid,
                                         temporal_thr=1200,
                                         spatial_thr=50,
                                         max_speed=0.07)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        eval_fix1200.append((time_precision, dist_coverage, mobility_f1))

        print('------')

        # fixed 120 s threshold: the metrics printed below are not aggregated
        traj_list = segment_trajectories(alltraj,
                                         uid,
                                         temporal_thr=120,
                                         spatial_thr=50,
                                         max_speed=0.07)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)

        print('------')

        traj_list = segment_trajectories_random(alltraj, uid, nbr_traj=2000)
        avg_nbr_points = np.mean([len(t) for t in traj_list])
        print('NT %d - ANP %.2f' % (len(traj_list), avg_nbr_points))
        time_precision, dist_coverage, mobility_f1 = evalaute_segmentation(
            alltraj, traj_list, print_report=True)
        eval_random.append((time_precision, dist_coverage, mobility_f1))

    cur.close()
    con.close()
    print('')

    # Median, mean, and standard deviation of (time_precision, dist_coverage,
    # mobility_f1) for the adaptive (ADP) and fixed-1200s (FIX) methods.
    for stat_name, stat in [('median', np.median), ('mean', np.mean),
                            ('std', np.std)]:
        for label, scores in [('ADP', eval_adaptive), ('FIX', eval_fix1200)]:
            print('%s %s - TP: %.3f - DC: %.3f - F1: %.3f' %
                  (label, stat_name, stat([v[0] for v in scores]),
                   stat([v[1] for v in scores]), stat([v[2] for v in scores])))