Example #1
    def __getLatest(self, dt=None):
        print(f'{self.dateFrom} - {self.dateTo} Grab latest data', dt)
        if not dt: dt = hp.GetTimeRanges(self.dateFrom, self.dateTo)
        allNodesDf = qrs.allTestedNodes(dt)
        allNodesDf = allNodesDf[(allNodesDf['ip'] != '')
                                & ~(allNodesDf['ip'].isnull())]
        # in some cases one IP maps to two different hostnames
        allNodesDf = self.__removeDuplicates(allNodesDf)
        rows = []
        # run a query per IP because aggregating the geolocation fields is not trivial (if possible at all)
        for item in allNodesDf.sort_values('host',
                                           ascending=False).to_dict('records'):
            lastRec = qrs.mostRecentMetaRecord(item['ip'], item['ipv6'], dt)
            if len(lastRec) > 0:
                rows.append(lastRec)
            else:
                item['site_index'] = item['site']
                rows.append(item)

        columns = [
            'ip', 'timestamp', 'host', 'site', 'administrator', 'email', 'lat',
            'lon', 'site_meta', 'site_index'
        ]
        df = pd.DataFrame(rows, columns=columns)

        df = df.drop_duplicates()

        df['last_update'] = df['timestamp'].apply(self.convertTime)
        # fall back to the period's end time when a record has no timestamp
        df['last_update'] = df['last_update'].fillna(self.convertTime(dt[1]))

        return df
Example #2
def runInParallel(dateFrom, dateTo):
    # query e.g. the past 12 hours and split the period into 12 time ranges
    # dateFrom, dateTo = hp.defaultTimeRange(12)
    # dateFrom, dateTo = ['2022-05-17 20:15', '2022-05-18 08:15']
    print(f'Run for period: {dateFrom} - {dateTo}')
    dtList = hp.GetTimeRanges(dateFrom, dateTo, 12)
    with ProcessPoolExecutor(max_workers=4) as pool:
        result = pool.map(getTraceData, [[dtList[i], dtList[i+1]] for i in range(len(dtList)-1)])
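Each of these examples assumes the same contract for hp.GetTimeRanges: given a period and an optional interval count, it returns the interval+1 boundary timestamps of equal sub-periods, which callers then pair into [start, end] windows. A minimal stand-in sketch under that assumption (get_time_ranges is hypothetical, not the project helper, and the real timestamp format may differ):

from datetime import datetime

def get_time_ranges(date_from, date_to, intervals=1, fmt='%Y-%m-%d %H:%M'):
    # Hypothetical stand-in for hp.GetTimeRanges: split [date_from, date_to]
    # into `intervals` equal windows and return the intervals+1 boundaries
    # as epoch milliseconds (an assumed output format).
    start = datetime.strptime(date_from, fmt).timestamp() * 1000
    end = datetime.strptime(date_to, fmt).timestamp() * 1000
    step = (end - start) / intervals
    return [int(start + i * step) for i in range(intervals + 1)]

dtList = get_time_ranges('2022-05-17 20:15', '2022-05-18 08:15', 12)
# pair consecutive boundaries into the [start, end] windows mapped above
windows = [[dtList[i], dtList[i + 1]] for i in range(len(dtList) - 1)]
assert len(windows) == 12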
Example #3
    def queryData(self, idx):
        data = []
        intv = int(hp.CalcMinutes4Period(self.dateFrom, self.dateTo) / 60)
        time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo, intv)
        for i in range(len(time_list) - 1):
            data.extend(qrs.query4Avg(idx, time_list[i], time_list[i + 1]))

        return data
Example #4
def queryData(dateFrom, dateTo):
    data = []
    # query in portions since ES does not allow aggregations with more than 10000 buckets
    intv = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    time_list = hp.GetTimeRanges(dateFrom, dateTo, intv)
    for i in range(len(time_list) - 1):
        data.extend(query4Avg(time_list[i], time_list[i + 1]))

    return data
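The interval count above makes each query window one hour wide regardless of the period length, so every per-window aggregation stays well under the 10000-bucket cap. A quick check of the arithmetic with a hypothetical 7-day period (assuming CalcMinutes4Period returns the span in whole minutes):

minutes = 7 * 24 * 60     # hypothetical 7-day period -> 10080 minutes
intv = int(minutes / 60)  # 168 windows, i.e. one per hour
assert intv == 168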
Example #5
def loadPacketLossData(dateFrom, dateTo):
    data = []
    intv = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    time_list = hp.GetTimeRanges(dateFrom, dateTo, intv)
    for i in range(len(time_list) - 1):
        data.extend(
            qrs.query4Avg('ps_packetloss', time_list[i], time_list[i + 1]))

    return pd.DataFrame(data)
Example #6
    def InOutDf(self, idx, idx_df):
        print(idx)
        in_out_values = []
        time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo)
        for t in ['dest_host', 'src_host']:
            meta_df = idx_df.copy()

            df = pd.DataFrame(
                qrs.queryDailyAvg(idx, t, time_list[0],
                                  time_list[1])).reset_index()

            df['index'] = pd.to_datetime(df['index'],
                                         unit='ms').dt.strftime('%d/%m')
            df = df.transpose()
            header = df.iloc[0]
            df = df[1:]

            df.columns = ['day-3', 'day-2', 'day-1', 'day']

            meta_df = pd.merge(meta_df, df, left_on="host", right_index=True)

            # average each day's values per site; keep NaN when any host value is missing
            daily_avgs = [
                meta_df.groupby('site').agg(
                    {col: lambda x: x.mean(skipna=False)}).reset_index()
                for col in ['day-3', 'day-2', 'day-1', 'day']
            ]

            site_avg_df = reduce(
                lambda x, y: pd.merge(x, y, on='site', how='outer'),
                daily_avgs)
            site_avg_df.set_index('site', inplace=True)
            change = site_avg_df.pct_change(axis='columns')
            site_avg_df = pd.merge(site_avg_df,
                                   change,
                                   left_index=True,
                                   right_index=True,
                                   suffixes=('_val', ''))
            site_avg_df['direction'] = 'IN' if t == 'dest_host' else 'OUT'

            in_out_values.append(site_avg_df)

        site_df = pd.concat(in_out_values).reset_index()
        site_df = site_df.round(2)

        return {"data": site_df, "dates": header}
Example #7
    def getData(self, src, dest):
        time_list = hp.GetTimeRanges(self.root_parent.dateFrom,
                                     self.root_parent.dateTo)

        df = pd.DataFrame(qrs.queryAllValues(self._idx, src, dest, time_list))
        df.rename(columns={hp.getValueField(self._idx): 'value'}, inplace=True)
        if len(df) > 0:
            df['log_value'] = np.log(df['value'].replace(0, np.nan))
            df['sqrt'] = df['value']**(1 / 2)
        return df
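Replacing zeros with NaN before taking the log keeps -inf out of log_value, which would otherwise break downstream statistics and plots. A minimal illustration on a toy series:

import numpy as np
import pandas as pd

s = pd.Series([0.0, 1.0, 10.0])
print(np.log(s.replace(0, np.nan)))  # NaN, 0.0, ~2.3026 -- no -inf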
Example #8
def queryNodesGeoLocation():

    include = ["geolocation", "external_address.ipv4_address",
               "external_address.ipv6_address", "config.site_name", "host"]
    period = hp.GetTimeRanges(*hp.defaultTimeRange(days=30))

    query = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "range": {
                            "timestamp": {
                                "gte": period[0],
                                "lte": period[1]
                            }
                        }
                    }
                ]
            }
        }
    }
    # scan (from elasticsearch.helpers) streams all matching documents via the scroll API
    data = scan(client=hp.es, index='ps_meta', query=query,
                _source=include, filter_path=['_scroll_id', '_shards', 'hits.hits._source'])

    count = 0
    ddict = {}
    for res in data:
        # progress marker every 100k documents
        if not count % 100000:
            print(count)
        src = res['_source']

        site = src['config']['site_name'] if 'config' in src else None

        if 'ipv4_address' in src['external_address']:
            ip = src['external_address']['ipv4_address']
        else:
            ip = src['external_address']['ipv6_address']

        geoip = [None, None]
        if 'geolocation' in src:
            geoip = src['geolocation'].split(",")

#         if 'speed' in src['external_address']:
#             speed = src['external_address']['speed']

        # one entry per IP: if the IP is known and this record names a site,
        # update the site only; otherwise (re)write the full entry
        if (ip in ddict) and (site is not None):
            ddict[ip]['site'] = site
        else:
            ddict[ip] = {'lat': geoip[0], 'lon': geoip[1],
                         'site': site, 'host': src['host']}

        count += 1
    return ddict
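The returned dict keeps one entry per IP, which is convenient to turn into a frame for mapping. A usage sketch (assuming the function above is importable; geo_df is a hypothetical name):

import pandas as pd

nodes = queryNodesGeoLocation()  # {ip: {'lat': ..., 'lon': ..., 'site': ..., 'host': ...}}
geo_df = pd.DataFrame.from_dict(nodes, orient='index')
geo_df.index.name = 'ip'
print(geo_df[['site', 'lat', 'lon']].head())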
Example #9
    def __updateDataset(self):

        self.metaDf = self.getMetafromES()

        if len(self.metaDf) > 1:
            print('Update meta data')
            self.metaDf = self.__updateMetaData(self.metaDf,
                                                self.__getLatest())
        else:
            # Initially, grab one year of data split into 10 chunks in order to
            # fill in info that may not appear in the most recent data
            print('No data found. Query a year back.')
            dateTo = datetime.strftime(self.now, '%Y-%m-%d %H:%M')
            dateFrom = datetime.strftime(self.now - timedelta(days=365),
                                         '%Y-%m-%d %H:%M')
            timeRange = hp.GetTimeRanges(dateFrom, dateTo, 10)

            self.metaDf = self.__getLatest([timeRange[0], timeRange[1]])
            # chunk (0, 1) was queried above; continue with the remaining chunks
            for i in range(1, len(timeRange) - 1):
                print(
                    f'Period: {timeRange[i]}, {timeRange[i+1]}, data size before update: {len(self.metaDf)}'
                )
                self.metaDf = self.__updateMetaData(
                    self.metaDf,
                    self.__getLatest([timeRange[i], timeRange[i + 1]]))
                print(f'Size after update: {len(self.metaDf)}')
                print()

            # self.metaDf.loc[self.metaDf.site == 'CERN-PROD', 'lat'] = 46.2416566
            # self.metaDf.loc[self.metaDf.site == 'CERN-PROD', 'lon'] = 6.0468415

        # Finally, try to fill empty fields by searching for similar host names and assigning their values
        try:
            self.metaDf = self.__fixMissingSites(self.metaDf)
            self.metaDf = self.metaDf.fillna(
                self.metaDf[['site', 'lat', 'lon']].groupby('site').ffill())
            self.metaDf = self.metaDf.drop_duplicates(subset='ip', keep="last")

            # remove the >1500 nodes for which there is no meaningful info
            self.metaDf.fillna('', inplace=True)
            noInfo = (~self.metaDf['lat'].astype(bool) & ~self.metaDf['lon'].astype(bool)
                      & ~self.metaDf['administrator'].astype(bool) & ~self.metaDf['site_index'].astype(bool)
                      & ~self.metaDf['site_meta'].astype(bool) & ~self.metaDf['email'].astype(bool))
            toRemoveIds = self.metaDf[noInfo].index.values
            self.metaDf = self.metaDf[~self.metaDf.index.isin(toRemoveIds)]

        except Exception:
            print(traceback.format_exc())
        finally:
            print('Meta data done')
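The combined mask near the end keeps only rows where at least one of the six fields is non-empty: after fillna(''), astype(bool) maps empty strings to False, so a row is dropped only when every field is blank. A toy check of that behavior (hypothetical two-column frame):

import pandas as pd

df = pd.DataFrame({'lat': ['', '46.2'], 'email': ['', 'a@b.ch']})
no_info = ~df['lat'].astype(bool) & ~df['email'].astype(bool)
print(no_info.tolist())  # [True, False] -- only the first row carries no info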
Example #10
    def getValues(self, probdf):
        #     probdf = markNodes()
        frames = []
        time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo)
        for item in probdf[['src', 'dest', 'idx']].values:
            tempdf = pd.DataFrame(
                qrs.queryAllValues(item[2], item, time_list[0], time_list[1]))
            tempdf['idx'] = item[2]
            tempdf['hash'] = item[0] + "-" + item[1]
            tempdf['src'] = item[0]
            tempdf['dest'] = item[1]
            tempdf.rename(columns={hp.getValueField(item[2]): 'value'},
                          inplace=True)
            frames.append(tempdf)

        # DataFrame.append was removed in pandas 2.0; collect and concatenate once instead
        if not frames:
            return pd.DataFrame(columns=['timestamp', 'value', 'idx', 'hash'])
        return pd.concat(frames, ignore_index=True)
Example #11
def run(dateFrom, dateTo):
    # query e.g. the past 24 hours and split the period into 8 time ranges
    dtList = hp.GetTimeRanges(dateFrom, dateTo, 8)
    with ProcessPoolExecutor(max_workers=4) as pool:
        result = pool.map(getTraceData, [[dtList[i], dtList[i+1]] for i in range(len(dtList)-1)])