Example 1
def smooth_is_vehicle(df_trip,
                      vehicle_or_not,
                      non_vehi_seg_min_dura=1 * 60,
                      vehi_seg_min_dura=2.5 * 60):
    vehicle_or_not_chunks = list(chunks(vehicle_or_not))
    dt_all = df_trip['TIME_DELTA']
    vehicle_or_not_smoothed = vehicle_or_not  # note: aliases the input, so smoothing happens in place
    # remove short non-vehicle segments between vehicle segments
    num_chunks = len(vehicle_or_not_chunks)
    for idx, chunk in enumerate(vehicle_or_not_chunks):
        if idx != 0 and idx != num_chunks - 1 and vehicle_or_not[
                chunk[0]] == 0:
            chunk_dura = sum(dt_all[chunk[0]:chunk[1]])
            if chunk_dura < non_vehi_seg_min_dura:
                vehicle_or_not_smoothed[chunk[0]:chunk[1]] = [1] * (chunk[1] -
                                                                    chunk[0])
    # remove vehicle segments which are still short after combining
    is_vehicle_chunks = list(chunks(vehicle_or_not_smoothed))
    for chunk in is_vehicle_chunks:
        if vehicle_or_not_smoothed[chunk[0]] == 1:
            chunk_dura = sum(dt_all[chunk[0]:chunk[1]])
            if chunk_dura < vehi_seg_min_dura:
                vehicle_or_not_smoothed[chunk[0]:chunk[1]] = [0] * (chunk[1] -
                                                                    chunk[0])

    del vehicle_or_not_chunks, dt_all, num_chunks, is_vehicle_chunks
    return vehicle_or_not_smoothed
Example 2
    def get_affiliation_data(self):
        try:
            affiliation_ids = RedisTemplate.get(AFFILIATION_ALL_ID)
            affiliation_ids = json.loads(affiliation_ids)
            self.affiliation_ids = affiliation_ids
            for ids in chunks(self.affiliation_ids, 500):
                article_keys = [
                    AFFILIATION_RELATED_ARTICLE_ID_KEY_TEMPLATE.format(i)
                    for i in ids
                ]
                article_values = RedisTemplate.mget(keys=article_keys)
                article_id_dict = {}
                author_keys = [
                    AFFILIATION_RELATED_AUTHOR_ID_KEY_TEMPLATE.format(i)
                    for i in ids
                ]
                author_values = RedisTemplate.mget(keys=author_keys)
                author_id_dict = {}
                for i in range(len(ids)):
                    if article_values[i] is None:
                        article_id_dict[ids[i]] = []
                    else:
                        article_id_dict[ids[i]] = json.loads(article_values[i])
                    if author_values[i] is None:
                        author_id_dict[ids[i]] = []
                    else:
                        author_id_dict[ids[i]] = json.loads(author_values[i])
                self.related_article_dict.update(article_id_dict)
                self.related_author_dict.update(author_id_dict)
        except Exception:
            # log the traceback instead of silently discarding it
            traceback.print_exc()
            return
Example 3
def smooth_vehicle_type(df_trip,
                        original_result_label,
                        vehi_seg_min_dura=1 * 60):
    trip_segments = list(chunks(original_result_label, True))
    dt_all = df_trip['TIME_DELTA']
    res = original_result_label

    num_chunks = len(trip_segments)

    for idx, chunk in enumerate(trip_segments):
        if idx == 0 or idx == num_chunks - 1 or chunk[2] == 5:
            continue
        pre_label = trip_segments[idx - 1][2]
        post_label = trip_segments[idx + 1][2]
        if pre_label == post_label:
            if pre_label == 5:
                continue
            else:
                cur_chunk_duration = sum(dt_all[chunk[0]:chunk[1]])
                if cur_chunk_duration < vehi_seg_min_dura:
                    res[chunk[0]:chunk[1]] = [pre_label
                                              ] * (chunk[1] - chunk[0])

    del trip_segments, dt_all, num_chunks
    return res
Example 4
def update_affiliation_keyword_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0],
                                  reverse=False)
    related_keyword_dict = {}
    sql = '''SELECT keyword_id,keyword_desc,COUNT(article_id) AS num FROM keyword_article
        WHERE article_id IN %s
        GROUP BY keyword_id,keyword_desc
        ORDER BY num DESC'''
    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]

            # skip affiliations that have no associated articles
            if not articles:
                continue

            Cursor.execute(sql, (articles, ))
            raw_result = list(Cursor.fetchall())
            if not raw_result:  # fetchall() wrapped in list() is never None, check for emptiness
                continue
            keywords = list(map(parseKeyword, raw_result))
            related_dict[affiliation_id] = keywords
        related_keyword_dict.update(related_dict)
    print("{} related_keyword_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_keyword_dict)))
    pipeline = RedisTemplate.pipeline()
    for articles in chunks(related_article_list, 500):
        for article in articles:
            article_key = AFFILIATION_RELATED_KEYWORD_KEY_TEMPLATE.format(
                article[0])
            keywords = related_keyword_dict.get(article[0])
            if keywords:
                pipeline.set(article_key, json.dumps(keywords))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_keyword_job".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
def update_author_collaboration_job():
    sql = 'SELECT id FROM author'
    Cursor.execute(sql)
    author_list = list(map(lambda x: x[0], list(Cursor.fetchall())))
    update_sql = '''
        INSERT INTO author_collaboration(start_id,end_id,distance,predict_collaboration)
        VALUES (%s,%s,%s,%s)
    '''
    author_collaboration_list = []
    wfile = open(
        "/Users/Karl/Desktop/SoftwareExercise/authorCollaboration.txt",
        "a+",
        encoding="utf-8")
    for authors in chunks(author_list, 500):
        start_time = time.time()
        for author in authors:
            with Neo4jDriver.session() as session:
                res = session.read_transaction(searchCoAuthor, author)
                # data = []
                # for record in res:
                #     data.append(record["authorId"])
                # author_collaboration_dict[author]=data
                #     wfile.write(json.dumps(author_collaboration_dict, indent=4))
                #     author_collaboration_dict.clear()
                #     end_time = time.time()
                #     duration = end_time - start_time
                #     print('update_author_collaboration_job runtime is:{0:.3f}s'.format(duration))
                # wfile.close()

                for coAuthor in res:
                    jaccrdDistance = computeJaccrdDistance(
                        author, coAuthor["authorId"])
                    print((author, coAuthor["authorId"],
                           round(jaccrdDistance[0],
                                 2), json.dumps(jaccrdDistance[1])))
                    author_collaboration_list.append(
                        (author, coAuthor["authorId"],
                         round(jaccrdDistance[0],
                               2), json.dumps(jaccrdDistance[1])))
        try:
            Cursor.executemany(update_sql, author_collaboration_list)
            Connection.commit()
            # reset the batch so rows already inserted are not re-sent with the next chunk
            author_collaboration_list.clear()
        except Exception as e:
            print(e)
            Connection.rollback()
        end_time = time.time()
        duration = end_time - start_time
        print('update_author_collaboration_job 500 runtime is:{0:.3f}s'.format(
            duration))
        time.sleep(1)
def update_affiliation_new_article_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0],
                                  reverse=False)
    related_new_article_dict = {}
    sql = '''SELECT id FROM article
            WHERE id IN %s
            ORDER BY date DESC LIMIT 1'''
    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]

            # skip affiliations that have no associated articles
            if not articles:
                continue

            Cursor.execute(sql, (articles, ))
            raw_result = Cursor.fetchone()
            if raw_result is None:
                continue
            article_id = raw_result[0]
            related_dict[affiliation_id] = article_id
        related_new_article_dict.update(related_dict)
    print("{} related_new_article_dict_len: {}".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                                                             len(related_new_article_dict)))
    pipeline = RedisTemplate.pipeline()
    for articles in chunks(related_article_list,500):
        for article in articles:
            article_key = AFFILIATION_RELATED_NEW_ARTICLE_ID_KEY_TEMPLATE.format(article[0])
            new_article_id = related_new_article_dict.get(article[0])
            if new_article_id:
                pipeline.set(article_key,json.dumps(new_article_id))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_new_article_job finished".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
Example 7
    def _load_related_article_dict(self):

        for ids in chunks(self.affiliation_ids, 500):
            sql = '''
                    SELECT affiliation_id, group_concat(article_id) as article_ids
                    FROM affiliation_article
                    WHERE affiliation_id IN %s
                    GROUP BY affiliation_id
                '''
            Cursor.execute(sql, (ids, ))
            raw_result = list(Cursor.fetchall())
            related_dict = {}
            for info in raw_result:
                if info is None:
                    continue
                if info[1] is None or len(info[1]) == 0:
                    related_dict[info[0]] = []
                else:
                    related_dict[info[0]] = info[1].split(',')
            self.related_article_dict.update(related_dict)
            time.sleep(1)
        print("{} related_article_dict_len: {}".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            len(self.related_article_dict)))
Example 8
    def _save_to_redis(self):
        if len(self.affiliation_ids) > 0:
            RedisTemplate.set(AFFILIATION_ALL_ID,
                              json.dumps(self.affiliation_ids))

        pipeline = RedisTemplate.pipeline()
        for ids in chunks(self.affiliation_ids, 500):
            for _id in ids:
                article_key = AFFILIATION_RELATED_ARTICLE_ID_KEY_TEMPLATE.format(
                    _id)
                author_key = AFFILIATION_RELATED_AUTHOR_ID_KEY_TEMPLATE.format(
                    _id)
                related_article_ids = self.related_article_dict.get(_id)
                related_author_ids = self.related_author_dict.get(_id)
                if related_article_ids:
                    pipeline.set(article_key, json.dumps(related_article_ids))
                if related_author_ids:
                    pipeline.set(author_key, json.dumps(related_author_ids))
            pipeline.execute()
            time.sleep(1)
        print("{} save_to_redis finished".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
Example 9
def save_tripsummary_PSQL_2016(conn, cur, tablename_trip, tablename_extra,
                               trips_dict):
    """ Save extra point-level information into DB
        Input:
        conn: DB connection
        cur: DB cursor
        tablename_trip: DB table to save the trip dictionary data
        trips_dict: dictionary of trip summaries

        Return True if successful and False otherwise.

    """
    nids = [trips_dict['nid']] * len(trips_dict['trip_num'])
    dates = [trips_dict['analyzed_date']] * len(trips_dict['trip_num'])
    tot_num_trips = [trips_dict['tot_num_trips']] * len(trips_dict['trip_num'])
    if not trips_dict['home_loc'] == [None, None]:
        home_loc = [trips_dict['home_loc']] * len(trips_dict['trip_num'])
    else:
        home_loc = [[]] * len(trips_dict['trip_num'])
    if not trips_dict['school_loc'] == [None, None]:
        school_loc = [trips_dict['school_loc']] * len(trips_dict['trip_num'])
    else:
        school_loc = [[]] * len(trips_dict['trip_num'])
    trip_num = trips_dict['trip_num']
    start_poi_loc = trips_dict['start_poi_loc']
    end_poi_loc = trips_dict['end_poi_loc']
    start_sgt = trips_dict['start_sgt']
    end_sgt = trips_dict['end_sgt']
    tot_dist = trips_dict['tot_dist(km)']
    tot_dura = trips_dict['tot_dura(s)']
    valid_loc_perc = trips_dict['valid_loc_perc']
    num_pt = trips_dict['num_pt']
    manual_label_modes = []
    manual_label_strs = []
    manual_label_finishs = []
    users_ids = []
    time_modified_list = []
    google_label_modes = []
    google_failed_reasons = []
    google_label_finishs = []

    # check existence by id/nid first
    existence, manually_labeled, app_labeled, google_labeled_failed, google_labeled_trusted = checkNidDateExistence(
        cur, tablename_trip, nids[0], dates[0])
    if existence is None:
        logging.error("Existence checking failed!")
        return False
    try:
        if manually_labeled:
            # need to recreate the manually labeled trip-level modes for the new trips
            logging.warning(
                "There are manual labels existing. Start recreating.")
            cur_date_tuple = datetime.strptime(dates[0], "%Y%m%d")
            one_day_after = (cur_date_tuple +
                             timedelta(days=1)).strftime("%Y%m%d")
            # get the triplabel and pt-level label for the whole day
            allQuery = """SELECT triplabel,gt_mode_manual from """ + tablename_extra + """ WHERE nid=""" + str(
                nids[0]
            ) + """ AND (sgt>='""" + dates[
                0] + """ 00:00:00' AND sgt<'""" + one_day_after + """ 00:00:00') ORDER BY sgt"""
            cur.execute(allQuery)
            dataAll = cur.fetchall()
            if len(dataAll) > 0:
                rawColumns = list(zip(*dataAll))
                triplabels = np.array(rawColumns[0])
                gt_mode_manual_list = np.array(rawColumns[1])
                # go through each trip to get the trip-level modes
                for item in trip_num:
                    mode_str_cur_trip = ''
                    # get all labels for this trip
                    gt_mode_manual_cur_trip = gt_mode_manual_list[
                        triplabels == item].tolist()
                    # get mode chunks of this trip
                    mode_chunks = chunks(gt_mode_manual_cur_trip,
                                         include_values=True)
                    for mode_chunk in mode_chunks:
                        if mode_chunk[2] is not None:
                            mode_str_cur_trip += str(mode_chunk[2])
                    manual_label_modes.append(mode_str_cur_trip)
                    manual_label_finishs.append('t')
                    manual_label_strs.append('recreated')
                    users_ids.append(1)
                    time_modified_list.append(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            else:
                logging.error("Failed to get the pt-level manual labels")

        if google_labeled_trusted:
            # need to recreate the automatically labeled trip-level modes for the new trips, if the labels are trusted
            logging.warning(
                "There are trusted automatic labels existing. Start recreating."
            )
            cur_date_tuple = datetime.strptime(dates[0], "%Y%m%d")
            one_day_after = (cur_date_tuple +
                             timedelta(days=1)).strftime("%Y%m%d")
            # get the triplabel and pt-level label for the whole day
            allQuery = """SELECT triplabel,gt_mode_google from """ + tablename_extra + """ WHERE nid=""" + str(
                nids[0]
            ) + """ AND (sgt>='""" + dates[
                0] + """ 00:00:00' AND sgt<'""" + one_day_after + """ 00:00:00') ORDER BY sgt"""
            cur.execute(allQuery)
            dataAll = cur.fetchall()
            if len(dataAll) > 0:
                rawColumns = list(zip(*dataAll))
                triplabels = np.array(rawColumns[0])
                gt_mode_google_list = np.array(rawColumns[1])
                # go through each trip to get the trip-level modes
                for item in trip_num:
                    mode_str_cur_trip = ''
                    # get all labels for this trip
                    gt_mode_google_cur_trip = gt_mode_google_list[
                        triplabels == item].tolist()
                    # check the percentage of the labeled samples
                    None_mode_cnt = 0
                    for mode_item in gt_mode_google_cur_trip:
                        if mode_item is None:
                            None_mode_cnt += 1
                    if None_mode_cnt > 0.3 * len(gt_mode_google_cur_trip):
                        # if too many samples don't have automatic labels
                        google_label_modes.append(None)
                        google_label_finishs.append('t')
                        google_failed_reasons.append(
                            'Too few labels while recreating')
                    else:
                        # get mode chunks of this trip
                        mode_chunks = chunks(gt_mode_google_cur_trip,
                                             include_values=True)
                        for mode_chunk in mode_chunks:
                            if mode_chunk[2] is not None:
                                mode_str_cur_trip += str(mode_chunk[2])
                        google_label_modes.append(mode_str_cur_trip)
                        google_label_finishs.append('t')
                        google_failed_reasons.append(None)
            else:
                logging.error("Failed to get the pt-level manual labels")
        elif google_labeled_failed:
            logging.warning(
                "There are failed automatic labels existing. Save as failed auto-labeling."
            )
            google_label_modes = [None] * len(trip_num)
            google_label_finishs = ['t'] * len(trip_num)
            google_failed_reasons = ['Failed before recreating'
                                     ] * len(trip_num)
        else:
            google_label_modes = [None] * len(trip_num)
            google_label_finishs = ['f'] * len(trip_num)
            google_failed_reasons = [None] * len(trip_num)

        app_labeled_list = [app_labeled] * len(trip_num)

        if existence:
            # delete the old data if already exists because the new one might have diff number of trips
            logging.warning(
                "Trip exists! DB Connected to delete existing trip summaries")
            deleteQuery = """DELETE FROM """ + tablename_trip + """ WHERE nid=""" + str(
                nids[0]) + """ and analyzed_date='""" + dates[0] + """'"""
            cur.execute(deleteQuery)
            conn.commit()

        if manual_label_modes:
            # insert the new data with existing manual labels
            logging.warning(
                "DB Connected to insert new trip summaries with existing manual labels"
            )
            insertQuery = """INSERT INTO """ + tablename_trip + """ (nid,analyzed_date, tot_num_trips,\
                trip_num,home_loc, school_loc, start_poi_loc, end_poi_loc,tot_dist_km, tot_dura_s,\
                start_sgt,end_sgt, valid_loc_perc,num_pt, manual_label_finish, manual_label_str,\
                manual_label_mode, users_id, time_modified, app_label_finish, google_label_mode,\
                google_label_finish,google_failed_reason) \
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
            zip2save = zip(nids, dates, tot_num_trips, trip_num, home_loc, school_loc, \
                           start_poi_loc, end_poi_loc, tot_dist, tot_dura, start_sgt, end_sgt, valid_loc_perc, num_pt, \
                           manual_label_finishs, manual_label_strs, manual_label_modes, users_ids, time_modified_list, \
                           app_labeled_list, google_label_modes, google_label_finishs, google_failed_reasons)
            cur.executemany(insertQuery, zip2save)
            conn.commit()
            return True
        else:
            # insert the new data w/o any existing manual labels
            logging.warning(
                "DB Connected to insert new trip summaries w/o any existing manual labels"
            )
            insertQuery = """INSERT INTO """ + tablename_trip + """ (nid,analyzed_date,tot_num_trips,\
                trip_num,home_loc,school_loc,start_poi_loc,end_poi_loc,tot_dist_km,tot_dura_s,\
                start_sgt,end_sgt,valid_loc_perc,num_pt, app_label_finish, google_label_mode,\
                google_label_finish,google_failed_reason) \
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
            zip2save = zip(nids, dates, tot_num_trips, trip_num, home_loc, school_loc, \
                           start_poi_loc, end_poi_loc, tot_dist, tot_dura, start_sgt, end_sgt, valid_loc_perc, num_pt, \
                           app_labeled_list, google_label_modes, google_label_finishs, google_failed_reasons)
            cur.executemany(insertQuery, zip2save)
            conn.commit()
            return True

    except psycopg2.DatabaseError as e:
        logging.error(e)
        return False
Example 10
    def predict(self, data, modes):
        """predict whether a list of position follows a train route by detecting
        the nearest train stops. Input is the pandas data frame of
        measurements and an array of current mode predictions.  Returns
        an array of predicted modes of the same size as the input data
        frame has rows.

        """
        # extract lat/lon from data frame
        lat = data['WLATITUDE'].values
        lon = data['WLONGITUDE'].values

        # chunk is a tuple (start_idx, end_idx, mode)
        # go through each CAR, BUS and TRAIN chunk
        for start_idx, end_idx, _ in filter(
                lambda chunk: chunk[2] in [MODE_CAR, MODE_BUS, MODE_TRAIN],
                chunks(modes, include_values=True)):
            # test for distance first
            lat_seg = lat[start_idx:end_idx]
            lon_seg = lon[start_idx:end_idx]
            valid_lat_seg = lat_seg[np.where(np.invert(np.isnan(lat_seg)))[0]]
            valid_lon_seg = lon_seg[np.where(np.invert(np.isnan(lon_seg)))[0]]

            if len(valid_lon_seg) == 0:
                continue
            # TODO: parameters have to be tuned carefully
            is_train = predict_mode_by_location(valid_lat_seg,
                                                valid_lon_seg,
                                                self.train_location_tree,
                                                self.train_location_dict,
                                                self.train_route_dict,
                                                dist_thre=400,
                                                dist_pass_thres=7,
                                                num_stops_thre=3,
                                                dist_pass_thres_perc=0.7)

            # check whether the entry and exit points are close to any stations
            # if both are true then this segment is considered as TRAIN as well
            entry_pt_near = -1
            exit_pt_near = -1
            if start_idx - 1 >= 0:
                if not np.isnan(lat[start_idx - 1]):
                    nearest_station = find_nearest_station(
                        lat[start_idx - 1], lon[start_idx - 1],
                        self.train_location_tree, self.dist_thres_entry_exit)
                    if len(nearest_station) != 0:
                        entry_pt_near = 1
                    else:
                        entry_pt_near = 0

            if end_idx < len(modes):
                if not np.isnan(lat[end_idx]):
                    nearest_station = find_nearest_station(
                        lat[end_idx], lon[end_idx], self.train_location_tree,
                        self.dist_thres_entry_exit)
                    if len(nearest_station) != 0:
                        exit_pt_near = 1
                    else:
                        exit_pt_near = 0

            if is_train or entry_pt_near + exit_pt_near == 2:
                modes[start_idx:end_idx] = MODE_TRAIN
            else:
                modes[start_idx:end_idx] = MODE_CAR
        return modes
def update_affiliation_database_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0],
                                  reverse=False)
    affiliation_info_list = []
    sql = '''
                        SELECT aff.name,AVG(art.citation_count),SUM(art.citation_count),
                        COUNT(art.id),MIN(YEAR(art.date)),MAX(YEAR(art.date)),
                        COUNT(art.pdf_link),AVG(art.total_usage-art.citation_count)
                        FROM article art,affiliation aff
                        WHERE art.id IN %s
                        AND aff.id = %s
                '''
    back_up_sql = '''
                        SELECT aff.name
                        FROM affiliation aff
                        WHERE aff.id = %s
                '''
    update_sql = '''
                        INSERT INTO affiliation_info
                        (affiliation_id,affiliation_name,average_citation_per_article,
                        citation_count,publication_count,start_year,end_year,
                        available_download,average_download_per_article,
                        create_time,update_time)
                        VALUES 
                        (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        ON DUPLICATE KEY
                        UPDATE affiliation_name = VALUES (affiliation_name),
                        average_citation_per_article = VALUES (average_citation_per_article),
                        citation_count = VALUES (citation_count),
                        publication_count = VALUES(publication_count),
                        start_year = VALUES (start_year),
                        end_year = VALUES (end_year),
                        available_download = VALUES (available_download),
                        average_download_per_article = VALUES (average_download_per_article),
                        update_time = VALUES (update_time)
                '''
    for affiliations_articles in chunks(related_article_list, 500):
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]
            update_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            if not articles:
                Cursor.execute(back_up_sql, (affiliation_id, ))
                raw_result = Cursor.fetchone()
                if raw_result is None:
                    continue
                affiliation_name = raw_result[0]
                affiliation_info_list.append(
                    (affiliation_id, affiliation_name, 0.0, 0, 0, -1, -1, 0,
                     0.0, update_time, update_time))
                continue
            Cursor.execute(sql, (
                articles,
                affiliation_id,
            ))
            raw_result = Cursor.fetchone()
            if raw_result is None:
                continue
            affiliation_name = raw_result[0]
            average_citation_per_article = float(
                str(raw_result[1].quantize(Decimal('0.00'))))
            citation_count = int(str(raw_result[2]))
            publication_count = raw_result[3]
            start_year = raw_result[4]
            end_year = raw_result[5]
            available_download = raw_result[6]
            average_download_per_article = float(
                str(raw_result[7].quantize(Decimal('0.00'))))
            affiliation_info_list.append(
                (affiliation_id, affiliation_name,
                 average_citation_per_article, citation_count,
                 publication_count, start_year, end_year, available_download,
                 average_download_per_article, update_time, update_time))

    print("{} affiliation_info_list_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(affiliation_info_list)))

    for affiliation_infos in chunks(affiliation_info_list, 500):
        try:
            Cursor.executemany(update_sql, affiliation_infos)
            Connection.commit()
        except Exception as e:
            print(e)
            Connection.rollback()
        time.sleep(1)
    print("{} update_affiliation_database_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
Example 12
def segFind(data_frame, daily_summary, mode_thresh, isAM, dist_lim, max_mode, max_walk):
    # function used to get mode segments out from a trip data frame and obtain the mode and dist

    # definition of mode codes
    MODE_WALK_IN = 3
    MODE_WALK_OUT = 2
    MODE_STOP_IN = 1
    MODE_STOP_OUT = 0

    # thresholds for calculating distance of mode segs
    ALL_WALK_TIME = 15 * 60  # below this duration, the walk segment distance is computed from all points
    real_to_jump_dist = 2  # for short walking segs, cap the distance at 2x the straight-line (jump) distance

    pred_modes = data_frame[['CCMODE']].values[:,0] # take out the predicted modes, copy of the data_frame column
    # change all STOP_IN, STOP_OUT, WALK_OUT to WALK_IN
    pred_modes[(pred_modes==MODE_STOP_IN) | (pred_modes==MODE_STOP_OUT) | (pred_modes==MODE_WALK_OUT)] = MODE_WALK_IN
    mode_segs = list(chunks(pred_modes,True)) # take the mode chunks
    num_valid_mode_seg = 0
    prev_mode = 0

    # print pred_modes
    # print mode_segs
    # print data_frame['DISTANCE_DELTA'].values.tolist()
    # print data_frame['TIME_DELTA'].values.tolist()

    # go through each mode chunk
    for mode_seg in mode_segs:
        time_span = np.sum(data_frame['TIME_DELTA'].values[mode_seg[0]:mode_seg[1]])

        # skip if the total segment time is below the threshold, or if we already have max_mode valid segments
        if time_span < mode_thresh or num_valid_mode_seg > max_mode-1:
            continue
        else:
            latlon_start = [data_frame['WLATITUDE'].values[mode_seg[0]],data_frame['WLONGITUDE'].values[mode_seg[0]]]
            latlon_end = [data_frame['WLATITUDE'].values[mode_seg[1]-1],data_frame['WLONGITUDE'].values[mode_seg[1]-1]]
            jump_dist = great_circle_dist(latlon_start,latlon_end,'meters')
            num_valid_mode_seg += 1
            if isAM:
                mode_key = 'am_mode'
                dist_key = 'am_distance'
                dura_key = 'am_duration'
            else:
                mode_key = 'pm_mode'
                dist_key = 'pm_distance'
                dura_key = 'pm_duration'

            # calculate the distance of this mode segment
            if int(mode_seg[2]) == MODE_WALK_IN:
                modes_cur_seg = data_frame['CCMODE'].values[mode_seg[0]:mode_seg[1]]
                dist_cur_seg = data_frame['DISTANCE_DELTA'].values[mode_seg[0]:mode_seg[1]]
                if time_span<ALL_WALK_TIME:
                    # if the time span is too small, consider all 0-3 modes as walking
                    dist_seg = np.nansum(dist_cur_seg)
                    dist_seg = checkLim_round(dist_seg,jump_dist*real_to_jump_dist)
                else:
                    # else if the time span is not too small, only consider 2 and 3 modes
                    dist_seg = np.nansum(dist_cur_seg[np.where((modes_cur_seg==MODE_WALK_IN) | (modes_cur_seg==MODE_WALK_OUT))[0]])
                    dist_seg = checkLim_round(dist_seg,max_walk*1000)
            else:
                dist_seg = np.nansum(data_frame['DISTANCE_DELTA'].values[mode_seg[0]:mode_seg[1]])

            if dist_seg==0 or np.isnan(dist_seg):
                # filter out the zero or nan values of dist_seg
                continue

            if mode_seg[2]==prev_mode:
                # if the current mode is same to the previous one, combine the two distance
                prev_dist = daily_summary[dist_key][len(daily_summary[dist_key])-1]
                cur_dist = checkLim_round((prev_dist*1000+dist_seg) / 1000,dist_lim)
                daily_summary[dist_key][len(daily_summary[dist_key])-1]=float(np.round(cur_dist,4))

                prev_dura = daily_summary[dura_key][len(daily_summary[dura_key])-1]
                cur_dura = prev_dura+time_span
                daily_summary[dura_key][len(daily_summary[dura_key])-1]=int(cur_dura)
                continue

            daily_summary[mode_key].append(int(mode_seg[2])) # append the mode
            daily_summary[dist_key].append(checkLim_round(dist_seg / 1000,dist_lim))
            daily_summary[dura_key].append(int(time_span))
            prev_mode = mode_seg[2]

    return num_valid_mode_seg
Example 13
def label_pts_by_pois(pois_latlon_comb, pois_label_temp, data_frame,
                      home_cover_radius, sch_cover_radius, poi_cover_radius,
                      poi_min_dwell_time):
    """ label all points to detected POIs
        Input:
        pois_latlon_comb: combined pois, but not chronological
        pois_label_temp: temporary labels for each POI in the given list, -2 for home, -3 for school , -4 to -n for others
        data_frame: pandas data frame of all one day data of the device
        Output:
        pois_latlon_chro: chronological pois, can be duplicated, like: [home, school, home]
        pois_label_chro: chronological poi labels, -2 for home, -3 for school, 1 to n for other POIs
        data_frame['POI_LABEL']: label of each point indicating which POI it belongs to, -1 for none, -2 for home... [same as above]
    """

    # list of lat/lon and timestamp of all points across the day
    latlon_all = data_frame[['WLATITUDE','WLONGITUDE']].values.tolist()
    ts_all = data_frame['TIMESTAMP'].values
    sgt_all = data_frame['TIME_SGT'].values
    lat_all = data_frame['WLATITUDE'].values
    delta_dist_all = data_frame['DISTANCE_DELTA'].values
    # pt-level label indicating which poi this point belongs to
    # initialize the labels with -1
    poi_label_pt = np.array([-1]*len(latlon_all)) 

    # go through each POI, label all points
    for idx,poi in enumerate(pois_latlon_comb):
        # list() is needed in Python 3: np.array over a bare map object would not give a numeric array
        dist2poi_list = list(
            map(lambda x: great_circle_dist(x, [poi[0], poi[1]], unit="meters"),
                latlon_all))
        dist2poi_array = np.array(dist2poi_list)
        if pois_label_temp[idx] == -2:
            # if the poi is home
            poi_label_pt[dist2poi_array<=home_cover_radius] = -2
        elif pois_label_temp[idx] == -3:
            # if the poi is school
            poi_label_pt[dist2poi_array<=sch_cover_radius] = -3
        else:
            # if it's other pois
            poi_label_pt[dist2poi_array<=poi_cover_radius] = pois_label_temp[idx]

    # wipe out short noise between two adjacent same pois
    poi_label_chunks = chunks_real(poi_label_pt,include_values=True)
    num_chunks =  len(poi_label_chunks)
    if num_chunks>1:
        for idx,label_chunk in enumerate(poi_label_chunks):
            if label_chunk[2]==-1:
                # go through all noise chunks
                lat_cur = lat_all[label_chunk[0]:label_chunk[1]]
                if idx == 0:
                    # if noise chunk is in the beginning
                    if (ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]]<60*5) or \
                    (float(ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]])/(label_chunk[1]-label_chunk[0])>1000) \
                    or (len(lat_cur[np.isnan(lat_cur)])*1.0/len(lat_cur)>0.9):
                        # if the chunk is small, or average delta time is big, or most points have invalid location
                        poi_label_pt[label_chunk[0]:label_chunk[1]]=poi_label_chunks[idx+1][2]
                elif idx == num_chunks-1:
                    # if noise chunk is in the end
                    if (ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]]<60*5) or \
                    (float(ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]])/(label_chunk[1]-label_chunk[0])>1000) \
                    or (len(lat_cur[np.isnan(lat_cur)])*1.0/len(lat_cur)>0.9):
                        # if the chunk is small, or average delta time is big, or most points have invalid location
                        poi_label_pt[label_chunk[0]:label_chunk[1]]=poi_label_chunks[idx-1][2]
                elif (poi_label_chunks[idx-1][2]==poi_label_chunks[idx+1][2]):
                    # if the former and latter chunks have same labels and this noise chunk is short in duration
                    # or most points have invalid loc, or most of the points are sleeping, or the average velocity is small
                    # set labels of this noise chunk as the same label
                    if (ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]]<60*20) or (len(lat_cur[np.isnan(lat_cur)])*1.0/len(lat_cur)>0.9) \
                        or (np.nansum(delta_dist_all[label_chunk[0]:label_chunk[1]])/(ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]])<1.0) \
                        or (float(ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]])/(label_chunk[1]-label_chunk[0])>1000):
                        # logging.debug("noise removed")
                        poi_label_pt[label_chunk[0]:label_chunk[1]]=poi_label_chunks[idx-1][2]

    # obtain the pois_latlon_chro and pois_label_chro
    pois_latlon_comb = np.array(pois_latlon_comb)
    pois_label_temp = np.array(pois_label_temp)
    pois_latlon_chro = []
    pois_label_chro = []
    pois_start_idx = []
    pois_start_sgt = []
    pois_end_idx = []
    pois_end_sgt = []
    num_normal_poi = 1
    poi_label_chunks = chunks(poi_label_pt,include_values=True)
    for label_chunk in poi_label_chunks:
        # go through all poi chunks chronologically
        if label_chunk[2] == -1:
            # non poi chunk
            continue
        else:
            # poi chunk
            if (ts_all[label_chunk[1]-1]-ts_all[label_chunk[0]]<poi_min_dwell_time):
                # if the poi chunk is too short, remove it
                poi_label_pt[label_chunk[0]:label_chunk[1]]=-1
            else:
                if label_chunk[2]==-2 or label_chunk[2]==-3:
                    # home or school chunk
                    pois_latlon_chro.append(pois_latlon_comb[pois_label_temp==label_chunk[2],:].tolist()[0])
                    pois_label_chro.append(label_chunk[2])
                else:
                    # for normal poi, just count from 1 to n
                    pois_latlon_chro.append(pois_latlon_comb[pois_label_temp==label_chunk[2],:].tolist()[0])
                    pois_label_chro.append(num_normal_poi)
                    poi_label_pt[label_chunk[0]:label_chunk[1]] = num_normal_poi
                    num_normal_poi += 1
                pois_start_idx.append(label_chunk[0])
                pois_start_sgt.append(sgt_all[label_chunk[0]])
                pois_end_idx.append(label_chunk[1]-1)
                pois_end_sgt.append(sgt_all[label_chunk[1]-1])
                # if there are more than two points with valid location for this poi, unlabel the leading
                # points up to (and including) the first valid point and the trailing points from the last valid point on
                lat_cur_poi = lat_all[label_chunk[0]:label_chunk[1]]
                idx_valid_loc = np.where(~np.isnan(lat_cur_poi))[0]
                if len(idx_valid_loc)>2:
                    if label_chunk[0]!=0: # to avoid creating invalid trips in the beginning of the day
                        poi_label_pt[label_chunk[0]:label_chunk[0]+idx_valid_loc[0]+1]=-1
                    if label_chunk[1]!=len(lat_all): # to avoid creating invalid trips in the end of the day
                        poi_label_pt[label_chunk[0]+idx_valid_loc[-1]:label_chunk[1]]=-1

    data_frame['POI_LABEL'] = pd.Series(poi_label_pt)
    pois_dict = {'pois_latlon_chro':pois_latlon_chro, 'pois_label_chro': pois_label_chro, 'pois_start_idx': pois_start_idx, \
    'pois_end_idx': pois_end_idx, 'pois_start_sgt': pois_start_sgt, 'pois_end_sgt': pois_end_sgt}
    return pois_dict
Example 14
def evaluate_overall_bibibinary(vehicle_or_not_model,
                                bus_or_not_model,
                                mrt_or_car_model,
                                features_test,
                                labels_test,
                                vehicle_or_not_idx,
                                bus_or_not_idx,
                                mrt_or_car_idx,
                                if_smooth=True):
    write = "**********Evaluate Overall Result**********\n"
    write += "with 4 labels\n"
    vehicle_or_not_result = vehicle_or_not_model.predict(
        np.array(features_test.iloc[:, vehicle_or_not_idx]))
    if len(np.shape(vehicle_or_not_result)) > 1:
        vehicle_or_not_result = np.argmax(vehicle_or_not_result, 1)

    bus_or_not_result = bus_or_not_model.predict(
        np.array(features_test.iloc[:, bus_or_not_idx]))
    if len(np.shape(bus_or_not_result)) > 1:
        bus_or_not_result = np.argmax(bus_or_not_result, 1)

    mrt_or_car_result = mrt_or_car_model.predict(
        np.array(features_test.iloc[:, mrt_or_car_idx]))
    if len(np.shape(mrt_or_car_result)) > 1:
        mrt_or_car_result = np.argmax(mrt_or_car_result, 1)

    result_label = []
    trip_chunks = list(chunks(features_test['trip_id'].tolist()))
    if if_smooth is True:
        for trip_chunk in trip_chunks:
            vehicle_or_not_result[trip_chunk[0]:trip_chunk[1]] = \
                smooth_is_vehicle(features_test.iloc[trip_chunk[0]:trip_chunk[1]],
                                  vehicle_or_not_result[trip_chunk[0]:trip_chunk[1]])

    for idx, t in enumerate(vehicle_or_not_result):
        if t == 0:  # stationary or stop
            result_label.append(5)
        elif t == 1:  # vehicle
            if bus_or_not_result[idx] == 0:
                if mrt_or_car_result[idx] == 0:
                    result_label.append(2)  # mrt
                elif mrt_or_car_result[idx] == 1:
                    result_label.append(4)  # car
            elif bus_or_not_result[idx] == 1:
                result_label.append(3)  # bus
            else:
                print("Error in overall evaluation: wrong label! at idx: %d" %
                      idx)
        else:  # unexpected value in vehicle_or_not_result
            print("Error in overall evaluation: wrong label! at idx %d, %d" %
                  (idx, t))

    write += str(Counter(labels_test)) + '\n'
    con_matrix = confusion_matrix(labels_test, result_label)
    acc = accuracy_score(labels_test, result_label)
    write += str(con_matrix) + '\n'
    write += "Classification report:\n"
    write += str(classification_report(labels_test, result_label)) + '\n'

    evaluation_report.add_content(write)
    evaluation_report.add_accuracy(acc)

    del write, vehicle_or_not_result, bus_or_not_result, mrt_or_car_result, trip_chunks, con_matrix, acc

    return result_label
Example 15
def evaluate_overall_lstm(vehicle_or_not_model,
                          vehicle_type_model,
                          features_test,
                          labels_test,
                          vehicle_or_not_idx,
                          vehicle_type_idx,
                          if_smooth=True):
    write = "**********Evaluate Overall Result**********\n"
    write += "Using manual labelled data, with 4 labels\n"
    vehicle_or_not_test = np.reshape(
        np.array(features_test.iloc[:, vehicle_or_not_idx]),
        (len(features_test), 6, int(len(vehicle_or_not_idx) / 6)))
    vehicle_or_not_result = vehicle_or_not_model.predict(vehicle_or_not_test)
    if len(np.shape(vehicle_or_not_result)) > 1:
        vehicle_or_not_result = np.argmax(vehicle_or_not_result, 1)

    vehicle_type_test = np.reshape(
        np.array(features_test.iloc[:, vehicle_type_idx]),
        (len(features_test), 6, int(len(vehicle_type_idx) / 6)))
    vehicle_type_result = vehicle_type_model.predict(vehicle_type_test)
    if len(np.shape(vehicle_type_result)) > 1:
        vehicle_type_result = np.argmax(vehicle_type_result, 1)

    result_label = []
    trip_chunks = list(chunks(features_test['trip_id'].tolist()))
    if if_smooth is True:
        write += "Smoooooooothing vehicle_or_not_result~~~~~~~~\n"
        for trip_chunk in trip_chunks:
            vehicle_or_not_result[trip_chunk[0]:trip_chunk[1]] = \
                smooth_is_vehicle(features_test.iloc[trip_chunk[0]:trip_chunk[1]],
                                  vehicle_or_not_result[trip_chunk[0]:trip_chunk[1]])

    # is_vehicle_smoothing()
    for idx, t in enumerate(vehicle_or_not_result):
        if t == 0:  # stationary or stop
            result_label.append(5)
        elif t == 1:  # vehicle
            if vehicle_type_result[idx] == 0:
                result_label.append(2)  # mrt
            elif vehicle_type_result[idx] == 1:
                result_label.append(3)  # bus
            elif vehicle_type_result[idx] == 2:
                result_label.append(4)  # car
            else:
                print("Error in overall evaluation: wrong label! at idx %d, %d" %
                      (idx, vehicle_type_result[idx]))
        else:  # unexpected value in vehicle_or_not_result
            print("Error in overall evaluation: wrong label! at idx %d, %d" %
                  (idx, t))
    if if_smooth is True:
        write += "Smoooooooothing smooth_vehicle_type~~~~~~~~\n"
        for trip_chunk in trip_chunks:
            result_label[trip_chunk[0]:trip_chunk[1]] = \
                smooth_vehicle_type(features_test.iloc[trip_chunk[0]:trip_chunk[1]],
                                    result_label[trip_chunk[0]:trip_chunk[1]])

    write += str(Counter(labels_test)) + '\n'
    con_matrix = confusion_matrix(labels_test, result_label)
    acc = accuracy_score(labels_test, result_label)
    write += str(con_matrix) + '\n'
    write += "Classification report:\n"
    write += str(classification_report(labels_test, result_label)) + '\n'

    evaluation_report.add_content(write)
    evaluation_report.add_accuracy(acc)

    del write, vehicle_or_not_test, vehicle_or_not_result, vehicle_type_test, vehicle_type_result, trip_chunks, \
        con_matrix, acc

    return result_label
Example 16
def calc_geo_time_features(data_frame,
                           queried_date_str,
                           window_size,
                           high_velocity_thresh=40):
    """
    Calculate additional features and attributes from the raw hardware
    data. New attributes are added as new columns in the data frame in
    place.
    Additional features included: ANALYZED_DATE, TIME_DELTA, STEPS, STEPS_DELTA, DISTANCE_DELTA, VELOCITY, ACCELERATION,
    MOV_AVE_VELOCITY, MOV_AVE_ACCELERATION
    :param data_frame: The original dataframe, including those raw features
    :param queried_date_str: Analyzed date (as a string)
    :param window_size: Window size of sliding window
    :param high_velocity_thresh: A threshold to determine whether the velocity is too high, unit : m/s
    :return: The status of success of feature calculation
    """

    # add analyzed date into the data frame
    data_frame['ANALYZED_DATE'] = pd.Series([queried_date_str] *
                                            len(data_frame))
    # calculate the SGT time of the day, in hours
    time_SGT = list(
        map(lambda x: get_hour_SGT(x), data_frame['TIMESTAMP'].values))
    data_frame['TIME_SGT'] = pd.Series(time_SGT)

    # calculate time delta since the last measurement, in seconds
    a = np.array(data_frame.iloc[:-1]['TIMESTAMP'])
    b = np.array(data_frame.iloc[1:]['TIMESTAMP'])
    delta_timestamps = list(b - a)
    if data_frame['TIME_SGT'][0] < 1.5:
        # for the first measurement, use the time elapsed since midnight when the first point falls between 0:00 and 1:30 SGT
        delta_timestamps = [int(data_frame['TIME_SGT'][0] * 3600)
                            ] + delta_timestamps
    else:
        # otherwise use a zero delta for the first measurement
        delta_timestamps = [0] + delta_timestamps
    data_frame['TIME_DELTA'] = pd.Series(delta_timestamps)
    # check if there's negative delta_t
    ts_array = np.array(delta_timestamps)
    if any(ts_array < 0):
        logging.error("There's negative delta_t from DB!!! Length is: " +
                      str(sum(ts_array < 0)))
        return False

    # calculate steps delta since the last measurement
    consec_steps = zip(data_frame[['STEPS']].values[:-1],
                       data_frame[['STEPS']].values[1:])
    delta_steps = map(lambda x: x[1][0] - x[0][0], consec_steps)
    # filter out negative delta_steps
    delta_steps = [dstep if dstep >= 0 else 0 for dstep in delta_steps]
    # add a zero value for the first measurement where no delta is available
    data_frame['STEPS_DELTA'] = pd.Series([0] + delta_steps)

    # select rows in data frame that have valid locations
    df_validloc = data_frame.loc[~np.isnan(data_frame['WLATITUDE'])
                                 & ~np.isnan(data_frame['WLONGITUDE'])]
    # calculate distance delta from pairs of valid lat/lon locations that follow each other
    valid_latlon = df_validloc[['WLATITUDE', 'WLONGITUDE']].values
    dist_delta = list(
        map(
            lambda loc_pair: great_circle_dist(
                loc_pair[0], loc_pair[1], unit="meters"),
            zip(valid_latlon[:-1], valid_latlon[1:])))
    # calculate time delta from pairs of valid timestamps
    valid_times = df_validloc['TIMESTAMP'].values
    time_delta = valid_times[1:] - valid_times[:-1]
    # calculate velocity, m/s
    velocity = dist_delta / time_delta

    # create new columns for distance delta, velocity and acceleration; rows without valid locations stay NaN
    data_frame['DISTANCE_DELTA'] = pd.Series(dist_delta, df_validloc.index[1:])
    data_frame['VELOCITY'] = pd.Series(
        velocity, df_validloc.index[1:])  # velocity in m/s
    data_frame['ACCELERATION'] = data_frame['VELOCITY'] / data_frame[
        'TIME_DELTA']  # acceleration in m/s^2

    # for points with missing locations, fill VELOCITY with the first valid velocity that follows the gap
    validloc_label = np.isnan(
        data_frame['WLATITUDE'].values)  # True for points with missing location
    validloc_label_chunks = chunks(validloc_label, include_values=True)
    for label_chunk in validloc_label_chunks:
        # find True chunks (no loc) and assign the velocity
        if label_chunk[2] and label_chunk[1] != len(data_frame):
            data_frame.loc[data_frame.index[0] + label_chunk[0]:data_frame.index[0] + label_chunk[1] - 1, 'VELOCITY'] = \
                data_frame['VELOCITY'][label_chunk[1]]

    # replace very high velocity values which are due to wifi
    # localizations errors with NaN in VELOCITY column
    idx_too_high = np.where(
        data_frame['VELOCITY'].values > high_velocity_thresh)[0].tolist()
    idx_too_high = [item + data_frame.index[0] for item in idx_too_high]
    idx_bef_too_high = (np.array(idx_too_high) - 1).tolist()
    data_frame.loc[
        idx_too_high,
        ['WLATITUDE', 'WLONGITUDE', 'DISTANCE_DELTA', 'VELOCITY']] = np.nan
    data_frame.loc[
        idx_bef_too_high,
        ['WLATITUDE', 'WLONGITUDE', 'DISTANCE_DELTA', 'VELOCITY']] = np.nan

    # calculate the moving average of velocity, m/s
    LARGE_TIME_JUMP = 60  # seconds
    velocity_all = data_frame['VELOCITY'].values
    moving_ave_velocity_all = moving_ave_velocity(velocity_all,
                                                  np.array(delta_timestamps),
                                                  LARGE_TIME_JUMP, window_size)
    moving_ave_acc_all = moving_ave_velocity(data_frame['ACCELERATION'].values,
                                             np.array(delta_timestamps),
                                             LARGE_TIME_JUMP, window_size)
    data_frame['MOV_AVE_VELOCITY'] = pd.Series(
        moving_ave_velocity_all)  # velocity in m/s
    data_frame['MOV_AVE_ACCELERATION'] = pd.Series(
        moving_ave_acc_all)  # acceleration in m/s^2

    return True