Example #1
def get_link(url):
    link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>')
    links = []

    # open web content
    f = urllib2.urlopen(url)
    content = f.read()

    # HTML-tag version: find all URLs and save them to links
    # soup = BeautifulSoup(content, "lxml")
    # for a in soup.find_all('a', href=True):
    #     if "detik.com" in a['href']:
    #         if "http:" not in a['href']:
    #             a['href'] = "http:" + a['href']
    #         print "Found the URL:", a['href']
    #         links.append(a['href'])

    # regex version: find all URLs and save them to links
    for link in link_exr.findall(content):
        if "detik.com" in link[0]:
            link_detik = link[0]
            if "http:" not in link_detik:
                link_detik = "http:" + link_detik
            links.append(link_detik)

    # save to a DataFrame and drop duplicate URLs
    df = DataFrame(links, columns=['detik url'])
    df = df.drop_duplicates()

    print df.head()

    # create and save to a MySQL database
    detik_db = create_engine("mysql://*****:*****@localhost/data_detik")
    df.to_sql('url_detik', detik_db, if_exists='replace')
Example #2
def removeDuplicate(file):
    """Removes duplicate points based on X, Y coordinates

       Returns a numpy array"""
    df = DataFrame(np.vstack((file.x, file.y, file.z)).transpose(), columns=['X', 'Y', 'Z'])
    df.drop_duplicates(subset=['X','Y'], inplace=True)
    return df.values
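A minimal usage sketch for removeDuplicate above, assuming only numpy and pandas; the point data and the SimpleNamespace stand-in for a point-cloud file are illustrative:

import numpy as np
from types import SimpleNamespace
from pandas import DataFrame

pts = SimpleNamespace(x=np.array([0.0, 0.0, 1.0]),
                      y=np.array([0.0, 0.0, 2.0]),
                      z=np.array([5.0, 7.0, 9.0]))
# keeps the first Z seen for each (X, Y) pair
unique_xyz = removeDuplicate(pts)
print(unique_xyz)  # [[0. 0. 5.] [1. 2. 9.]]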
Example #3
def test_drop_duplicates_tuple():
    df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
                                   'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates(('AA', 'AB'))
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AA', 'AB'), keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AA', 'AB'), keep=False)
    expected = df.loc[[]]  # empty df
    assert len(result) == 0
    tm.assert_frame_equal(result, expected)

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates((('AA', 'AB'), 'B'))
    tm.assert_frame_equal(result, expected)
def process_duplicated_entries(dfm_stk_strc:DataFrame,stockid):
    dfm_duplicated = dfm_stk_strc[dfm_stk_strc.duplicated(['变动日期'])]
    # print(dfm_duplicated)
    dfm_stk_strc.drop_duplicates('变动日期',inplace=True)
    for index, row in dfm_duplicated.iterrows():
        # dfm_stk_strc.loc[index]['变动原因'] = dfm_stk_strc.loc[index]['变动原因'] +'|'+row['变动原因']
        dfm_stk_strc.loc[index,'变动原因'] = dfm_stk_strc.loc[index]['变动原因'] + '|' + row['变动原因']
        logprint('Stock %s 变动日期 %s 记录合并到主记录中. %s' %(stockid,row['变动日期'],tuple(row)))
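The same merge-then-drop pattern as a compact sketch with illustrative English column names ('date' and 'reason'); here the surviving row is looked up by value rather than assumed to share the dropped row's index label:

import pandas as pd

df = pd.DataFrame({'date': ['2020-01-01', '2020-01-01', '2020-02-01'],
                   'reason': ['split', 'bonus', 'placement']})
dupes = df[df.duplicated('date')]            # later rows that repeat a date
df = df.drop_duplicates('date')              # keep the first row per date
for _, row in dupes.iterrows():
    keep = df.index[df['date'] == row['date']][0]   # index of the surviving row
    df.loc[keep, 'reason'] += '|' + row['reason']   # fold the dropped reason in
print(df)  # 2020-01-01 -> 'split|bonus', 2020-02-01 -> 'placement'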
Example #5
def test_duplicated_with_misspelled_column_name(subset):
    # GH 19730
    df = DataFrame({'A': [0, 0, 1],
                    'B': [0, 0, 1],
                    'C': [0, 0, 1]})

    with pytest.raises(KeyError):
        df.duplicated(subset)

    with pytest.raises(KeyError):
        df.drop_duplicates(subset)
Example #6
    def _decode_solutions(self, solutions):
        decoded_solutions = DataFrame(columns=["targets", "fitness"])
        index = 0
        for solution in solutions:
            combinations = self._decoder(solution.candidate, flat=True, decompose=True)
            for targets in combinations:
                if len(targets) > 0:
                    decoded_solutions.loc[index] = [tuple(targets), solution.fitness]
                    index += 1

        decoded_solutions.drop_duplicates(inplace=True, subset="targets")
        decoded_solutions.reset_index(inplace=True)

        return decoded_solutions
Example #7
def test_drop_duplicates_with_duplicate_column_names():
    # GH17836
    df = DataFrame([
        [1, 2, 5],
        [3, 4, 6],
        [3, 4, 7]
    ], columns=['a', 'a', 'b'])

    result0 = df.drop_duplicates()
    tm.assert_frame_equal(result0, df)

    result1 = df.drop_duplicates('a')
    expected1 = df[:2]
    tm.assert_frame_equal(result1, expected1)
Example #8
    def get_dominant_alleles( sample_ids, marker_ids = None ):

        t1 = aliased( Allele )
        t2 = aliased( Allele )
        s1 = aliased( AlleleSet )
        marker_ids = [ int(x) for x in marker_ids ]

        # I honestly forgot how the mundane thing below works !!!
        # I was smarter when I was younger 8-(
        q = dbsession.query( t1.marker_id, s1.sample_id, t1.value, t1.size, t1.height ).\
            join( s1 ).\
            outerjoin( t2, and_( t1.marker_id == t2.marker_id,
                                t1.alleleset_id == t2.alleleset_id,
                                t1.height < t2.height) ).\
            filter( t2.marker_id == None ).order_by( t1.marker_id, s1.sample_id ).\
            filter( s1.sample_id.in_( sample_ids ) ).filter( t1.marker_id.in_( marker_ids ))

        df = DataFrame( [ (marker_id, sample_id, value, size, height)
                        for marker_id, sample_id, value, size, height in q ] )

        if len(df) == 0:
            return None

        df.columns = ( 'marker_id', 'sample_id', 'value', 'size', 'height' )
        return df.drop_duplicates( ['marker_id', 'sample_id'] )
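As a pandas-only sketch of the same idea (not the original SQLAlchemy query): sort by height and let drop_duplicates keep the tallest allele per (marker_id, sample_id). The sample values are made up:

import pandas as pd

alleles = pd.DataFrame({
    'marker_id': [1, 1, 2],
    'sample_id': [10, 10, 10],
    'value':     [200, 202, 150],
    'size':      [199.8, 201.9, 149.7],
    'height':    [550, 900, 430],
})
dominant = (alleles.sort_values('height', ascending=False)
                   .drop_duplicates(['marker_id', 'sample_id'])
                   .sort_values(['marker_id', 'sample_id']))
print(dominant)  # keeps the 202/900 allele for marker 1 and the single allele for marker 2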
def recursive_add_consumers(consumer_id, seen = set([])):
        if consumer_id is None:
            return

        seen.add(consumer_id)
        consumer_key = sample[sample.Consumer == consumer_id]
        IP = DataFrame(consumer_key.IP).drop_duplicates()  # unique IPs used by this consumer

        n = np.array(np.arange(len(IP)))

        IP_Map = []
        for i in n:
          value = sample[sample.IP.isin([IP.iloc[i,0]])]
          IP_Map.append(value)

        #print IP_Map

        print consumer_id
        print seen
        consumer_list = []

        #list of unique consumers that are linked to this one
        [consumer_list.extend(y.Consumer.iloc[l].tolist()) for l in [range(len(y.Consumer)) for y in IP_Map]]

        #print consumer_list
        #print [x for x in set(consumer_list).difference([consumer_id])]
        #unique_consumer_list = []
        #print [ x for x in set([y.Consumer.iloc[0] for y in IP_Map])]


        #tuples of ips and unique consumers attached to them
        print [(y.IP.iloc[0],set(y.Consumer.iloc[l].tolist())) for l in [range(len(y.Consumer)) for y in IP_Map]]
 def getDateTimeSeries(self,instrument=None):
     if instrument is None:
        __dateTime = DataFrame()
        for element in self.__instrument:
            __dateTime = __dateTime.append(self.__feed[element].getPriceDataSeries().getDateTimes())
        __dateTime = __dateTime.drop_duplicates([0])
        return __dateTime.values  # at this point a 2-D array is returned
     return self.__feed[instrument].getPriceDataSeries().getDateTimes()
def process_duplicated_entries(dfm_stk_info:DataFrame,stockid):
    dfm_duplicated = dfm_stk_info[dfm_stk_info.duplicated(['股权登记日'])]
    # print(dfm_duplicated)
    dfm_stk_info.drop_duplicates('股权登记日',inplace=True)
    for index, row in dfm_duplicated.iterrows():
        dfm_stk_info.loc[index,'分红年度'] = add_considering_None(dfm_stk_info.loc[index]['分红年度'],row['分红年度'])
        dfm_stk_info.loc[index,'分红方案'] = dfm_stk_info.loc[index]['分红方案'] + '|' + row['分红方案']
        if dfm_stk_info.loc[index]['方案文本解析错误标识位'] !='E':
            if row['方案文本解析错误标识位'] == 'E':
                dfm_stk_info.loc[index, '方案文本解析错误标识位'] = 'E'
                dfm_stk_info.loc[index, '派息(税前)(元)/10股'] = None
                dfm_stk_info.loc[index, '转增(股)/10股'] = None
                dfm_stk_info.loc[index, '送股(股)/10股'] = None
            else:
                dfm_stk_info.loc[index,'派息(税前)(元)/10股'] = add_considering_None(dfm_stk_info.loc[index]['派息(税前)(元)/10股'],row['派息(税前)(元)/10股'])
                dfm_stk_info.loc[index,'转增(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['转增(股)/10股'] , row['转增(股)/10股'])
                dfm_stk_info.loc[index,'送股(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['送股(股)/10股'] , row['送股(股)/10股'])
        logprint('Stock %s 股权登记日 %s 记录合并到主记录中. %s' %(stockid,row['股权登记日'],tuple(row)))
Example #12
 def compute_ambiguity(self):
     results = self.cursor.execute('SELECT type, entity FROM entities')
     tmp = results.fetchall()
     freq_df = DataFrame(tmp, columns=['e_type', 'entity'])
     freq_df['ambiguity'] = 10
     freq_df = freq_df.drop_duplicates()
     result_computed_location = self._compute_location_ambiguity(freq_df)
     result_computed_name = self._compute_name_ambiguity(result_computed_location)
     return result_computed_name
def IP_Weight_Calc(consumer_id):

        if consumer_id == None:
            return
        consumer_key = sample[sample.Consumer == consumer_id]

        IP = DataFrame(consumer_key.IP).drop_duplicates()  # unique IPs used by this consumer

        n = np.array(np.arange(len(IP)))
        IP_Weight_List = []

        for i in n:
            value = sample[sample.IP.isin([IP.iloc[i,0]])]
            value2 = len(DataFrame(value.Consumer).drop_duplicates())  # distinct consumers on this IP
            value3 = (1.0/(value2**2))
            IP_Weight_List.append(value3)

        return sum(IP_Weight_List)
Example #14
    def _compute_consistency(self):
        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df = df.drop_duplicates()
        tmp = df.groupby(['e_type', 'entity']).size().reset_index()
        tmp.rename(columns={0: 'consistency'}, inplace=True)

        return tmp
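On toy rows, the 'consistency' computed above is simply the number of distinct documents in which each (type, entity) pair occurs; a minimal sketch of the same groupby-size step with illustrative values:

import pandas as pd

df = pd.DataFrame({'did': [1, 2, 2, 2],
                   'e_type': ['ORG', 'ORG', 'LOC', 'ORG'],
                   'entity': ['acme', 'acme', 'paris', 'acme']})
tmp = df.drop_duplicates().groupby(['e_type', 'entity']).size().reset_index()
tmp.rename(columns={0: 'consistency'}, inplace=True)
print(tmp)  # acme appears in 2 documents, paris in 1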
def getSubwayDate(filepath2):
    allFiles = DataFrame(os.listdir(filepath2))
    allFiles.columns = ['FileNames']
    allFiles = allFiles[allFiles.FileNames.str.slice(0, 6) == 'gtfs-2']
    allFiles['datetime1'] = allFiles.FileNames.str.slice(5)
    allFiles.datetime1 = allFiles.datetime1.map(parser.parse)
    allFiles.datetime1 = pd.DatetimeIndex(allFiles.datetime1).tz_convert('America/New_York')
    times1 = []
    for dt in allFiles.datetime1:
        tm = dt.time()
        times1.append(tm)
    allFiles['time1'] = Series(times1)
    allFiles = allFiles[allFiles.time1 >= time(6)]
    allFiles = allFiles[allFiles.time1 <= time(9, 30)]
    
    df_allTrains = DataFrame(np.zeros(0, dtype = [('current_status', 'O')]))
    df_trips = DataFrame(np.zeros(0, dtype = [('route_id', 'O')]))
    #old_df = getTrains(allFiles.FileNames.iloc[0], test_df)
    for fileName0 in allFiles.FileNames:
        fileName1 = os.path.join(filepath2,fileName0)
        try:
            df_allTrains, df_trips = getTrains(fileName1, df_allTrains, df_trips)
        except:
            print 'file ' + fileName0 + ' did not work'
            continue
        
    df_allTrains = df_allTrains.sort_values(['trip_id', 'timestamp', 'timestamp1'])
    unique_ids = ['start_date','route_id','trip_id', 'train_id', 'stop_id', 'stop_name', 'current_status', 'timestamp']
    df_allTrains.drop_duplicates(unique_ids, keep = 'last', inplace = True)
    unique_ids.remove('timestamp')
    df_grouped = df_allTrains.groupby(unique_ids)
    df_grouped1 = df_grouped['timestamp'].agg({'minTS1': min, 'maxTS1':max})
    df_grouped2 = df_grouped['timestamp1'].agg(max)
    df_grouped1['maxTS2'] = df_grouped2
    df_allTrains = df_grouped1.reset_index()
    df_allTrains.sort_values(['trip_id', 'minTS1'], inplace = True)
    
    df_trips = df_trips.sort_values(['trip_id', 'stop_id', 'timestamp1'])  
    
    shutil.rmtree(filepath2)
    return(df_allTrains, df_trips)
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)

    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]

    print pd.value_counts(info.group)[:10]

    print "Now let's work with all of the nutrients"
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "There seem to be a lot of duplicates"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "Both info and nutrients have description and group columns, so rename them"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print max_foods.ix['Amino Acids']['food']
def bulk_download(year, bucket):
    bucket = expanduser(bucket)
    filepath = bucket + 'latvia' + str(year) + '.csv'

    data = DataFrame()

    for scheme in SCHEMES:
        fragment = Fragment(scheme, year, bucket)

        if not fragment.is_cached:
            fragment.download()

        fragment.load_from_csv()
        fragment.cleanup()

        data = concat([data, fragment.data], ignore_index=True)
        log.debug('Added %s rows to bulk dataframe', len(fragment.data))

    data.drop_duplicates(inplace=True)
    data.to_csv(filepath, encoding='utf-8', mode='w+')
    log.info('Bulk download saved to %s (%s rows)', filepath, len(data))
Example #18
def test_drop_duplicates_for_take_all():
    df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                            'foo', 'bar', 'qux', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df.iloc[[0, 1, 2, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.iloc[[2, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.iloc[[2, 6]]
    tm.assert_frame_equal(result, expected)

    # multiple columns
    result = df.drop_duplicates(['AAA', 'B'])
    expected = df.iloc[[0, 1, 2, 3, 4, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['AAA', 'B'], keep='last')
    expected = df.iloc[[0, 1, 2, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['AAA', 'B'], keep=False)
    expected = df.iloc[[0, 1, 2, 6]]
    tm.assert_frame_equal(result, expected)
Example #19
def test_drop_duplicates_NA_for_take_all():
    # none
    df = DataFrame({'A': [None, None, 'foo', 'bar',
                          'foo', 'baz', 'bar', 'qux'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})

    # single column
    result = df.drop_duplicates('A')
    expected = df.iloc[[0, 2, 3, 5, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('A', keep='last')
    expected = df.iloc[[1, 4, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('A', keep=False)
    expected = df.iloc[[5, 7]]
    tm.assert_frame_equal(result, expected)

    # nan

    # single column
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 1, 5, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[3, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('C', keep=False)
    expected = df.iloc[[5, 6]]
    tm.assert_frame_equal(result, expected)
Example #20
def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = 'snx'
    #URL of form:
    #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \
          '&e=.csv&h={2}'

    idx_mod = idx_sym.replace('^', '@%5E')
    urlStr = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    #LOOP across component index structure,
    #break when no new components are found
    while (True in mask):
        urlStr = url.format(idx_mod, stats,  comp_idx)
        lines = (urllib.urlopen(urlStr).read().decode('utf-8').strip().
                 strip('"').split('"\r\n"'))

        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index('ticker')
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
Example #21
def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = "snx"
    # URL of form:
    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = _YAHOO_COMPONENTS_URL + "s={0}&f={1}&e=.csv&h={2}"

    idx_mod = idx_sym.replace("^", "@%5E")
    url_str = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    # LOOP across component index structure,
    # break when no new components are found
    while True in mask:
        url_str = url.format(idx_mod, stats, comp_idx)
        with urlopen(url_str) as resp:
            raw = resp.read()
        lines = raw.decode("utf-8").strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=["ticker", "name", "exchange"])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index("ticker")
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
def drop_reqpeat01():
    data=DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]})
    print data
    print data.duplicated()
    print data.drop_duplicates()
    data['v1']=range(7)
    print data.drop_duplicates(['k1'])
    print data
    print data.drop_duplicates(['k1','k2'],keep='last')
Example #23
def read_thermo_dat(model='nox', runDir=os.getcwd(), sample_int='1min', start=None, end=None):
    '''
        Reads Thermo data from .dat files.
        model = one of: nox, sox, or o3
        Returns the number of files read and a DataFrame containing all of the data,
        munged and organized, for the Thermo Scientific line of atmospheric gas analyzers.

        >>> filecount, no = read_thermo_dat('o3', runDir=dataDir)
    '''

    # If the data is in a different directory, change the directory
    if os.getcwd() != runDir:
        os.chdir(runDir)
    
    # Set the model name based on input
    if model =='nox':
        instrument = '42I'
    elif model == 'sox':
        instrument = '43I'
    elif model == 'o3':
        instrument = '49I'
    else:
        sys.exit("The model you defined is not valid or supported yet.")
    
    # grab all files in the directory for a given instrument with the .dat file extension
    files = get_files(instrument,fileType='dat',start=start, end=end, runDir=runDir)
    
    fileNo = 1
    data = DataFrame()
    
    # Concatenate the data from each file together to build one big dataframe
    for each in files:
        newData = pd.read_table(each, sep='\s+', skiprows=4, header=5, parse_dates=[[1,0]], keep_date_col=True, index_col='Date_Time', warn_bad_lines=True)
        data = pd.concat([data, newData])
        fileNo += 1
      
    # Create a duplicate column containing the index to easily drop all duplicate rows from merging files containing
    #  the same data
    data['stamp'] = data.index
    data = data.drop_duplicates(subset='stamp')
    
    # Depending on the model, do some stuff to clean it up
    if model == 'nox':
        data['no2'] = data['nox'] - data['no']
        
    # resample the data based on chosen imput
    data = data.resample(sample_int)
    
    return (fileNo, data)
def PrepeareAndSave(uid, dataToSearch, data, fileName="data.csv"):
    people = []
    address = []
    poi = []
    activity = []
    timeB = []
    timeE = []
    entities = []
    taxonomy = []
    concepts = []
    txt = []

    for i in xrange(0, len(dataToSearch["people"])):
        for j in xrange(0, len(data[i][2])):
            people.append(dataToSearch["people"][i])
            address.append(dataToSearch["address"][i])
            poi.append(dataToSearch["poi"][i])
            activity.append(dataToSearch["activity"][i])
            timeB.append(dataToSearch["timeB"][i])
            timeE.append(dataToSearch["timeE"][i])

            entities.append(data[i][2][j])
            taxonomy.append(data[i][3][j])
            concepts.append(data[i][4][j])
            txt.append(base64.b64encode(data[i][1][j]))

    df = DataFrame()
    df["people"] = people
    df["address"] = address
    df["poi"] = poi
    df["activity"] = activity
    df["entities"] = entities
    df["taxonomy"] = taxonomy
    df["concepts"] = concepts
    df["timeB"] = timeB
    df["timeE"] = timeE
    df["txt"] = txt
    df = df.drop_duplicates(take_last=True).reset_index().drop(["index"], axis=1)
    # print df
    try:
        df_old = read_csv(ROOT + str(uid) + "/" + fileName, ";")
        df_new = [df_old, df]
        df = pd.concat(df_new).drop(["Unnamed: 0"], axis=1)
    except:
        print ("New Data")
    df.to_csv(ROOT + str(uid) + "/" + fileName, sep=";")
    return df
def slide_10():
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    print data
    print data.duplicated()
    print data.duplicated('k1')
    print data.drop_duplicates()

    data['v1'] = range(7)
    print data
    print data.drop_duplicates(['k1'])
    print data.drop_duplicates(['k1', 'k2'], take_last=True)
Example #26
    def compute_tf_idf(self):
        # Find total number of document
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])

        base_df = df[['e_type', 'entity']]
        base_df = base_df.drop_duplicates()

        doc_t_df = df.drop_duplicates().groupby('entity').size()

        results = self.cursor.execute('SELECT did, total_word FROM documents')
        tmp = results.fetchall()
        df2 = DataFrame(tmp, columns=['did', 'total_word'])

        tmp = df[['did', 'entity']].groupby(['did', 'entity']).size().reset_index()
        tmp.rename(columns={0: 'term_freq'}, inplace=True)

        tf_idf_list = []

        for row in tmp.iterrows():
            values = row[1]
            did = values[0]
            entity = values[1]
            term_freq = values[2]
            total_word = df2[df2['did'] == did]['total_word'].get_values()[0]
            tf = float(term_freq) / total_word
            doc_t = doc_t_df.get_value(entity)
            idf = np.log(total_doc / doc_t)
            tf_idf = tf * idf
            tf_idf_list.append([entity, tf_idf])

        tf_idf_df = DataFrame(tf_idf_list, columns=['entity', 'tf_idf'])
        tf_idf_df = tf_idf_df.groupby('entity').agg('sum')

        base_df.loc[:, 'tf_idf'] = base_df['entity'].apply(lambda x: tf_idf_df['tf_idf'][x])

        return base_df
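A worked instance of the tf-idf arithmetic used above (the counts are illustrative): a 200-word document mentions an entity 4 times, and the entity occurs in 5 of 50 documents.

import numpy as np

tf = 4 / 200            # term_freq / total_word = 0.02
idf = np.log(50 / 5)    # log(total_doc / doc_t) ~ 2.303
print(tf * idf)         # ~ 0.046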
def deal_string02():
    import json
    db=json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    print len(db)
    print db[0]
    print db[0].keys()
    print db[0]['nutrients'][0]
    nutrients=DataFrame(db[0]['nutrients'])
    print nutrients[:7]
    info_keys=['description','group','id','manufacturer']
    info=DataFrame(db,columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    nutrients=[]
    for rec in db:
        fnuts=DataFrame(rec['nutrients'])
        fnuts['id']=rec['id']
        nutrients.append(fnuts)
    nutrients=pd.concat(nutrients,ignore_index=True)
    print nutrients
    print nutrients.duplicated().sum()
    nutrients=nutrients.drop_duplicates()
    col_mapping={'description':'food','group':'fgroup'}
    info=info.rename(columns=col_mapping,copy=False)
    print info
    col_mapping={'description':'nutrient','group':'nutgroup'}
    nutrients=nutrients.rename(columns=col_mapping,copy=False)
    print nutrients
    ndata=pd.merge(nutrients,info,on='id',how='outer')
    print ndata
    print ndata.ix[3000]
    result=ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
    # print result
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient=ndata.groupby(['nutgroup','nutrient'])
    get_maximum=lambda x:x.xs(x.value.idxmax())
    get_minimum=lambda x:x.xs(x.value.idxmin())
    max_foods=by_nutrient.apply(get_maximum)[['value','food']]
    max_foods.food=max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
Example #28
def hierarchical_clusters( log, show_plot=None ):
    """Translates traces to Parikh vectors and computes in the vector space
       a hierarchical clustering."""
    def get_parikh(case,alphabet):
        v = zeros(len(alphabet),dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] +1
        # canonical representation
        m = min(v)
        return v - m   
    
    actsind = {}
    i = 0
    for act in log.get_alphabet():
        actsind[act] = i
        i = i +1

    uniq_cases = log.get_uniq_cases()
    N = len(uniq_cases)
    M = len(actsind)
    data = zeros((N,M),dtype=int)
    i = 0
    parikhdict = {}
    for case in uniq_cases.keys():
        data[i] = get_parikh(case,actsind)
        str_i = ','.join(map(str,data[i]))
        if str_i not in parikhdict:
            parikhdict[str_i] = [i]
        else:
            parikhdict[str_i].append(i)
        i = i + 1
    df = DataFrame(data)
    data_uniq = df.drop_duplicates()
    Y = pdist(data_uniq,metric='euclidean')
    Z = linkage(Y,method='average')
    dendrogram(Z)
    show()
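A self-contained sketch of just the de-duplicate-then-cluster tail above, on a tiny hand-made Parikh matrix (two identical traces collapse to one row before clustering):

import numpy as np
from pandas import DataFrame
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

data = np.array([[1, 0, 2],
                 [1, 0, 2],
                 [0, 3, 1]])
data_uniq = DataFrame(data).drop_duplicates()   # cluster unique vectors only
Z = linkage(pdist(data_uniq, metric='euclidean'), method='average')
dendrogram(Z)
plt.show()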
Example #29
def fix_event_type(df: DataFrame):
    '''
    Not sure yet.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0,len(unique)):     # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]     # subset by that ID
        the_dummies = subset.columns != 'id'        # get all columns that do not equal that ID
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # get all of those columns to list
        event_sum = np.sum(aa)      # sum all of those
        
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df = df.set_value(subset.index, the_dummies, aa)
        df = df.set_value(subset.index, 'num_events', event_sum)
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b-a)
    return df
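A vectorized sketch of the same per-id aggregation, using a groupby-sum instead of the explicit loop plus drop_duplicates('id'); the toy columns are illustrative, not the original schema:

import pandas as pd

df = pd.DataFrame({'id': [1, 1, 2], 'event_type': ['a', 'b', 'a']})
dummies = pd.get_dummies(df, columns=['event_type'])
agg = dummies.groupby('id', as_index=False).sum()        # one row per id
agg['num_events'] = agg.drop(columns='id').sum(axis=1)   # total events per id
print(agg)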
Example #30
class DropDuplicates(object):

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        self.df = DataFrame({'key1': key1, 'key2': key2,
                             'value': np.random.randn(N * K)})
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        N = 1000000
        K = 10000
        key1 = np.random.randint(0, K, size=N)
        self.df_int = DataFrame({'key1': key1})
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        self.df_bool.drop_duplicates(inplace=inplace)
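These are asv (airspeed velocity) benchmark hooks: asv calls setup() before timing each time_* method. A rough manual-timing sketch outside asv, assuming the benchmark module's imports (pandas testing helpers as tm, numpy, DataFrame, Series) are available:

import timeit

bench = DropDuplicates()
bench.setup(inplace=False)
print(timeit.timeit(lambda: bench.time_frame_drop_dups(False), number=10))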
Example #31
 def _drop_duplicates_by_timestamp(self,
                                   heating_obj_df: pd.DataFrame
                                   ) -> pd.DataFrame:
     return heating_obj_df.drop_duplicates(column_names.TIMESTAMP, ignore_index=True, keep="last")
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

DF_obj=DataFrame({'column 1':[1,1,2,2,3,3,3],
                 'column 2':['a','a','b','b','c','c','c'],
                 'column 3':['A','A','B','B','C','C','C']})
print(DF_obj)

print(DF_obj.duplicated())
DF_obj=DF_obj.drop_duplicates()
print(DF_obj)

print(DF_obj.drop_duplicates(['column 3']))
def load_terms(df: DataFrame, term_creation_mode: str = 'ignore') -> int:
    """Creates Term objects from an input Pandas DataFrame and adds the newly created Terms to the database.
    
    Args:
        df (pandas.DataFrame): Input data. Contains columns:

            * 'Term Locale'
            * 'Term Category'
            * 'Term'
            * 'Case Sensitive'.

        term_creation_mode (str): Controls how duplicate Term conflicts are handled.
            Can be 'add', 'ignore', or 'replace'. Defaults to 'ignore'.
            Only relevant if a Term already exists in the database.

            * 'add': adds the new Terms while keeping the existing Terms.
            * 'ignore': skips adding new Terms.
            * 'replace': deletes the old Term(s) and then adds the new Term.

    Returns:
        int: The number of Terms added to the database.

    Raises:
        ValueError: invalid `term_creation_mode` argument.
    """
    @unique
    class _TermCreationMode(Enum):
        ADD = 'add'
        IGNORE = 'ignore'
        REPLACE = 'replace'

        @classmethod
        def get_modes(cls, *args) -> list:
            def _ga(mode, *args) -> tuple:
                return tuple([getattr(mode, arg) for arg in args])

            if len(args) == 1:
                return [getattr(mode, *args) for mode in cls]
            elif len(args) == 0:
                args = ('name', 'value')
            return [_ga(mode, *args) for mode in cls]

    def _prepare_term(
            dataframe_row: Series,
            term_creation_mode: _TermCreationMode) -> Union[Term, None]:
        """Instantiates and returns a new Term object based on input data.

        Args:
            dataframe_row (pandas.Series): Input data for Term creation.
            term_creation_mode (_TermCreationMode): Determines how an existing duplicate Term affects the return value.

        Returns:
            new_db_term (Term): A new Term object from the input row.
            None: if term_creation_mode is IGNORE.
        """
        term = dataframe_row['Term'].strip()
        qs_terms_of_this_term = Term.objects.filter(term=term)

        # instantiate a new Term
        new_db_term = Term(term=term,
                           source=dataframe_row['Term Category'],
                           definition_url=dataframe_row['Term Locale'])

        # handle term creation modes
        if qs_terms_of_this_term.exists():
            if term_creation_mode == _TermCreationMode.REPLACE:
                qs_terms_of_this_term.delete()
            elif term_creation_mode == _TermCreationMode.IGNORE:
                return None
        return new_db_term

    try:  # set term creation mode
        term_creation_mode = _TermCreationMode(term_creation_mode)
    except ValueError:  # raise custom argument error
        raise ValueError(
            f"Argument `term_creation_mode` must be one of: {_TermCreationMode.get_modes('value')}"
        )

    # preprocess DataFrame
    df.drop_duplicates(inplace=True)
    df.loc[df['Case Sensitive'] == False,
           'Term'] = df.loc[df['Case Sensitive'] == False, 'Term'].str.lower()
    df = df.drop_duplicates(subset='Term').dropna(subset=['Term'])

    # create new Terms. Filter: if _prepare_term() returns None, it is not added to this list
    new_db_terms = list(
        filter(None, (_prepare_term(row, term_creation_mode)
                      for _, row in df.iterrows())))

    # cache "global" term stems step - should be cached here via model manager
    Term.objects.bulk_create(new_db_terms)

    return len(new_db_terms)
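A minimal usage sketch, assuming the Django setup, the Term model, and the imports of the original module are in place; the sample rows are illustrative. Because 'Case Sensitive' is False, both rows lower-case to the same term, and the drop_duplicates(subset='Term') step keeps only one of them:

from pandas import DataFrame

terms_df = DataFrame({
    'Term Locale':    ['en', 'en'],
    'Term Category':  ['contracts', 'contracts'],
    'Term':           ['Force Majeure', 'force majeure'],
    'Case Sensitive': [False, False],
})
added = load_terms(terms_df, term_creation_mode='ignore')
print('%d new terms written' % added)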
Example #34
def test_drop_duplicates_NA():
    # none
    df = DataFrame({
        "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
        "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
        "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
        "D":
        range(8),
    })
    # single column
    result = df.drop_duplicates("A")
    expected = df.loc[[0, 2, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("A", keep="last")
    expected = df.loc[[1, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("A", keep=False)
    expected = df.loc[[]]  # empty df
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    result = df.drop_duplicates(["A", "B"])
    expected = df.loc[[0, 2, 3, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["A", "B"], keep="last")
    expected = df.loc[[1, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["A", "B"], keep=False)
    expected = df.loc[[6]]
    tm.assert_frame_equal(result, expected)

    # nan
    df = DataFrame({
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
        "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
        "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
        "D":
        range(8),
    })
    # single column
    result = df.drop_duplicates("C")
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("C", keep="last")
    expected = df.loc[[3, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("C", keep=False)
    expected = df.loc[[]]  # empty df
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    result = df.drop_duplicates(["C", "B"])
    expected = df.loc[[0, 1, 2, 4]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["C", "B"], keep="last")
    expected = df.loc[[1, 3, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["C", "B"], keep=False)
    expected = df.loc[[1]]
    tm.assert_frame_equal(result, expected)
Example #35
def test_drop_duplicates():
    df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(['AAA', 'B']))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep='last')
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ['AAA', 'B', 'C']]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep='last')
    expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    df['E'] = df['C'].astype('int8')
    result = df.drop_duplicates('E')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('E', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                    'y': [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = df.append([[1] + [0] * 8], ignore_index=True)

    for keep in ['first', 'last', False]:
        assert df.duplicated(keep=keep).sum() == 0
import numpy as np
from pandas import Series, DataFrame

missing = np.nan  # sentinel for missing values in the examples below

series_obj = Series(
    ['row 1', missing, 'row 3', 'row 4', 'row 5', missing, 'row 7', 'row 8'])

print(series_obj)

DF_obj = DataFrame({
    'column 1': [1, 1, 2, 2, 3, 3, 3],
    'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
    'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'C']
})

print(DF_obj)
print(DF_obj.duplicated())  #Check duplicates

print(DF_obj.drop_duplicates())

DF_obj = DataFrame({
    'column 1': [1, 1, 2, 2, 3, 3, 3],
    'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
    'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'C']
})

print(DF_obj)

print(DF_obj.drop_duplicates(
    ['column 3']))  # look for duplicates in column 3 and drop those rows

####################################Concatenate###########################################

DF_obj = DataFrame(np.arange(36).reshape(6, 6))
def df_drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates(
        subset=[Column.YM_TRACK_ID.value, Column.YM_ALBUM_ID.value])
Example #38
# Now, to do some analysis on the full nutrient data, the easiest approach is to
# combine every food's nutrients into one big table. We do that in a few steps:
# first, convert each food's nutrient list into a DataFrame and add a column with the food's id;
# then append that DataFrame to a list;
# finally, join everything together with concat.
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
# If all went well, nutrients should look like this
print(nutrients)
# This DataFrame inevitably ends up with some duplicate rows, so they can simply be dropped
print(nutrients.duplicated().sum())
nutrients = nutrients.drop_duplicates()
# Since both DataFrame objects have 'group' and 'description' columns, rename them
# so it is clear which one is which
col_mapping = {'description': 'food', 'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info)
col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients)
# With that done, info and nutrients can be merged
ndata = pd.merge(nutrients, info, on='id', how='outer')
print(ndata)
print(ndata.ix[3000])
# Plot a chart of median values by food group and nutrient type
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')
Example #39
def df_to_db(df: pd.DataFrame,
             ref_df: pd.DataFrame,
             region: Region,
             data_schema: DeclarativeMeta,
             provider: Provider,
             drop_duplicates: bool = True,
             fix_duplicate_way: str = 'ignore',
             force_update=False) -> object:
    step1 = time.time()
    precision_str = '{' + ':>{},.{}f'.format(8, 4) + '}'

    if not pd_is_not_null(df):
        return 0

    if drop_duplicates and df.duplicated(subset='id').any():
        df.drop_duplicates(subset='id', keep='last', inplace=True)

    schema_cols = get_schema_columns(data_schema)
    cols = list(set(df.columns.tolist()) & set(schema_cols))

    if not cols:
        logger.error("{} get columns failed".format(data_schema.__tablename__))
        return 0

    df = df[cols]

    # force update mode, delete duplicate id data in db, and write new data back
    if force_update:
        ids = df["id"].tolist()
        if len(ids) == 1:
            sql = f"delete from {data_schema.__tablename__} where id = '{ids[0]}'"
        else:
            sql = f"delete from {data_schema.__tablename__} where id in {tuple(ids)}"

        session = get_db_session(region=region,
                                 provider=provider,
                                 data_schema=data_schema)
        session.execute(sql)
        session.commit()
        df_new = df

    else:
        if ref_df is None:
            ref_df = get_data(region=region,
                              provider=provider,
                              columns=['id', 'timestamp'],
                              data_schema=data_schema,
                              return_type='df')
        if ref_df.empty:
            df_new = df
        else:
            df_new = df[~df.id.isin(ref_df.index)]

        # Whether a row is new cannot be decided by id alone; a full comparison is needed
        # if fix_duplicate_way == 'add':
        #     df_add = df[df.id.isin(ref_df.index)]
        #     if not df_add.empty:
        #         df_add.id = uuid.uuid1()
        #         df_new = pd.concat([df_new, df_add])

    cost = precision_str.format(time.time() - step1)
    logger.debug("remove duplicated: {}".format(cost))

    saved = 0
    if pd_is_not_null(df_new):
        saved = to_postgresql(region, df_new, data_schema.__tablename__)

    cost = precision_str.format(time.time() - step1)
    logger.debug("write db: {}, size: {}".format(cost, saved))

    return saved
Example #40
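# Note: this snippet is excerpted from a larger pandas multi-index indexing test;
# m, n, vals, and cols are assumed to be defined earlier in that module
# (m and n as sizes, cols as the column names, vals as the generated rows).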
# bunch of keys for testing
keys: List[Any] = [
    np.random.randint(0, 11, m),
    np.random.choice(list("abcdefghijk"), m),
    np.random.choice(pd.date_range("20141009", periods=11).tolist(), m),
    np.random.choice(list("ZYXWVUTSRQP"), m),
]
keys = list(map(tuple, zip(*keys)))
keys += list(map(lambda t: t[:-1], vals[:: n // m]))


# covers both unique index and non-unique index
df = DataFrame(vals, columns=cols)
a = pd.concat([df, df])
b = df.drop_duplicates(subset=cols[:-1])


def validate(mi, df, key):
    # check indexing into a multi-index before & past the lexsort depth

    mask = np.ones(len(df)).astype("bool")

    # test for all partials of this key
    for i, k in enumerate(key):
        mask &= df.iloc[:, i] == k

        if not mask.any():
            assert key[: i + 1] not in mi.index
            continue
Example #41
def _remove_duplicate_entries(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    logger.info('Removing duplicate entries')
    df.drop_duplicates(subset=[column_name], keep='first', inplace=True)
    return df
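Usage sketch for the helper above, with an illustrative column name and assuming the module-level logger it references is configured:

import logging
import pandas as pd

logger = logging.getLogger(__name__)  # stand-in for the original module's logger

df = pd.DataFrame({'email': ['a@x.com', 'a@x.com', 'b@x.com'], 'clicks': [1, 2, 3]})
df = _remove_duplicate_entries(df, 'email')   # keeps the first row per email
print(df)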
Example #42
class DFTrack:
    def __init__(self, df_points=None, columns=None):
        if df_points is None:
            self.df = DataFrame()
        elif isinstance(df_points, pd.DataFrame):
            self.df = df_points
        else:
            if columns is None:
                columns = [
                    'CodeRoute', 'Latitude', 'Longitude', 'Altitude', 'Date',
                    'Speed', 'TimeDifference', 'Distance', 'FileName'
                ]
            self.df = DataFrame(df_points, columns=columns)

    def export(self, filename='exported_file', export_format='csv'):
        """
		Export a data frame of DFTrack to JSON or CSV.

		Parameters
		----------
		export_format: string
			Format to export: JSON or CSV
		filename: string
			Name of the exported file
		"""
        if export_format.lower() == 'json':
            self.df.reset_index().to_json(orient='records',
                                          path_or_buf=filename + '.json')
        elif export_format.lower() == 'csv':
            self.df.to_csv(path_or_buf=filename + '.csv')
        else:
            raise TrackException('Must specify a valid format to export',
                                 "'%s'" % export_format)

    def getTracks(self):
        """
		Makes a copy of the DFTrack.

		Explanation: 
			http://stackoverflow.com/questions/27673231/why-should-i-make-a-copy-of-a-data-frame-in-pandas

		Returns
		-------
		copy: DFTrack
			The copy of DFTrack.
		"""
        return self.__class__(self.df.copy(), list(self.df))

    def sort(self, column_name):
        """
		Sorts the data frame by the specified column.

		Parameters
		----------
		column_name: string
			Column name to sort

		Returns
		-------
		sort: DFTrack
			DFTrack sorted
		"""
        if isinstance(column_name, list):
            for column in column_name:
                if column not in self.df:
                    raise TrackException('Column name not found',
                                         "'%s'" % column)
        else:
            if column_name not in self.df:
                raise TrackException('Column name not found',
                                     "'%s'" % column_name)

        return self.__class__(self.df.sort_values(column_name), list(self.df))

    def getTracksByPlace(self, place, timeout=10, only_points=True):
        """
		Gets the points of the specified place searching in Google's API
		and, if it does not get anything, it tries with OpenStreetMap's API.

		Parameters
		----------
		place: string
			Place to get the points
		timeout: int
			Time, in seconds, to wait for the geocoding service to respond 
			before returning a None value.
		only_points: boolean
			True to retrieve only the points that cross a place. False to
			retrieve all the points of the tracks that cross a place.

		Returns
		-------
		place: DFTrack
			A DFTrack with the points of the specified place or 
			None if nothing is found.
		"""
        track_place = self.getTracksByPlaceGoogle(place,
                                                  timeout=timeout,
                                                  only_points=only_points)
        if track_place is not None:
            return track_place

        track_place = self.getTracksByPlaceOSM(place,
                                               timeout=timeout,
                                               only_points=only_points)
        if track_place is not None:
            return track_place

        return None

    def getTracksByPlaceGoogle(self, place, timeout=10, only_points=True):
        """
		Gets the points of the specified place searching in Google's API.

		Parameters
		----------
		place: string
			Place to get the points
		timeout: int
			Time, in seconds, to wait for the geocoding service to respond 
			before returning a None value.
		only_points: boolean
			True to retrieve only the points that cross a place. False to
			retrieve all the points of the tracks that cross a place.

		Returns
		-------
		place: DFTrack
			A DFTrack with the points of the specified place or 
			None if nothing is found.
		"""
        try:
            geolocator = geopy.GoogleV3()
            location = geolocator.geocode(place, timeout=timeout)
        except geopy.exc.GeopyError as geo_error:
            return None

        southwest_lat = float(
            location.raw['geometry']['bounds']['southwest']['lat'])
        northeast_lat = float(
            location.raw['geometry']['bounds']['northeast']['lat'])
        southwest_lng = float(
            location.raw['geometry']['bounds']['southwest']['lng'])
        northeast_lng = float(
            location.raw['geometry']['bounds']['northeast']['lng'])

        df_place = self.df[(self.df['Latitude'] < northeast_lat)
                           & (self.df['Longitude'] < northeast_lng) &
                           (self.df['Latitude'] > southwest_lat) &
                           (self.df['Longitude'] > southwest_lng)]

        if only_points:
            return self.__class__(df_place)

        track_list = df_place['CodeRoute'].unique().tolist()
        return self.__class__(self.df[self.df['CodeRoute'].isin(track_list)])

    def getTracksByPlaceOSM(self, place, timeout=10, only_points=True):
        """
		Gets the points of the specified place searching in OpenStreetMap's API.

		Parameters
		----------
		place: string
			Place to get the points
		timeout: int
			Time, in seconds, to wait for the geocoding service to respond 
			before returning a None value.
		only_points: boolean
			True to retrieve only the points that cross a place. False to
			retrieve all the points of the tracks that cross a place.

		Returns
		-------
		place: DFTrack
			A DFTrack with the points of the specified place or 
			None if nothing is found.
		"""
        try:
            geolocator = geopy.Nominatim()
            location = geolocator.geocode(place, timeout=timeout)
        except geopy.exc.GeopyError as geo_error:
            return None

        southwest_lat = float(location.raw['boundingbox'][0])
        northeast_lat = float(location.raw['boundingbox'][1])
        southwest_lng = float(location.raw['boundingbox'][2])
        northeast_lng = float(location.raw['boundingbox'][3])

        df_place = self.df[(self.df['Latitude'] < northeast_lat)
                           & (self.df['Longitude'] < northeast_lng) &
                           (self.df['Latitude'] > southwest_lat) &
                           (self.df['Longitude'] > southwest_lng)]

        if only_points:
            return self.__class__(df_place)

        track_list = df_place['CodeRoute'].unique().tolist()
        return self.__class__(self.df[self.df['CodeRoute'].isin(track_list)])

    def getTracksByDate(self, start=None, end=None, periods=None, freq='D'):
        """
		Gets the points of the specified date range 
		using various combinations of parameters.

		2 of 'start', 'end', or 'periods' must be specified.

		Date format recommended: 'yyyy-mm-dd'

		Parameters
		----------
		start: date
			Date start period
		end: date
			Date end period
		periods: int
			Number of periods. If None, must specify 'start' and 'end'
		freq: string
			Frequency of the date range

		Returns
		-------
		df_date: DFTrack
			A DFTrack with the points of the specified date range.
		"""
        if trk_utils.isTimeFormat(start) or trk_utils.isTimeFormat(end):
            raise TrackException('Must specify an appropriate date format',
                                 'Time format found')

        rng = pd.date_range(start=start, end=end, periods=periods, freq=freq)

        df_date = self.df.copy()
        df_date['Date'] = pd.to_datetime(df_date['Date'])
        df_date['ShortDate'] = df_date['Date'].apply(
            lambda date: date.date().strftime('%Y-%m-%d'))
        df_date = df_date[df_date['ShortDate'].apply(lambda date: date in rng)]
        del df_date['ShortDate']

        df_date = df_date.reset_index(drop=True)

        return self.__class__(df_date, list(df_date))

    def getTracksByTime(self,
                        start,
                        end,
                        include_start=True,
                        include_end=True):
        """
		Gets the points between the specified time range.

		Parameters
		----------
		start: datetime.time
			Time start period
		end: datetime.time
			Time end period
		include_start: boolean
		include_end: boolean
			
		Returns
		-------
		df_time: DFTrack
			A DFTrack with the points of the specified date and time periods.
		"""
        if not trk_utils.isTimeFormat(start) or not trk_utils.isTimeFormat(
                end):
            raise TrackException('Must specify an appropriate time format',
                                 trk_utils.TIME_FORMATS)

        df_time = self.df.copy()

        index = pd.DatetimeIndex(df_time['Date'])
        df_time = df_time.iloc[index.indexer_between_time(
            start_time=start,
            end_time=end,
            include_start=include_start,
            include_end=include_end)]

        df_time = df_time.reset_index(drop=True)

        return self.__class__(df_time, list(df_time))

    def pointVideoNormalize(self):
        df = self.df.copy()

        df_norm = pd.DataFrame()
        group_size = df.groupby('CodeRoute').size()
        max_value = group_size.max()
        name_max_value = group_size.idxmax()

        grouped = df['CodeRoute'].unique()

        for name in tqdm(grouped, desc='Groups'):
            df_slice = df[df['CodeRoute'] == name]
            df_slice = df_slice.reset_index(drop=True)
            div = int(max_value / len(df_slice)) + 1
            df_index = DataFrame(df_slice.index)
            df_slice['VideoFrame'] = df_index.apply(
                lambda x: x + 1 if name_max_value == name else x * div)
            df_norm = pd.concat([df_norm, df_slice])

        df_norm = df_norm.reset_index(drop=True)

        return self.__class__(df_norm, list(df_norm))

    def timeVideoNormalize(self, time, framerate=5):
        df = self.df.copy()
        if time == 0:
            df['VideoFrame'] = 0
            df = df.reset_index(drop=True)
            return self.__class__(df, list(df))

        n_fps = time * framerate
        df = df.sort_values('Date')
        df_cum = trk_utils.calculateCumTimeDiff(df)
        grouped = df_cum['CodeRoute'].unique()

        df_norm = pd.DataFrame()
        point_idx = 1

        for name in tqdm(grouped, desc='Groups'):
            df_slice = df_cum[df_cum['CodeRoute'] == name]
            time_diff = float(
                (df_slice[['TimeDifference']].sum() / time) /
                framerate)  # Track duration divided by time and framerate

            df_range = df_slice[df_slice['CumTimeDiff'] == 0]
            df_range = df_range.reset_index(drop=True)
            df_range['VideoFrame'] = 0
            df_norm = pd.concat([df_norm, df_range])

            for i in tqdm(range(1, n_fps + 1), desc='Num FPS', leave=False):
                x_start = time_diff * (i - 1)
                x_end = time_diff * i

                df_range = df_slice[(df_slice['CumTimeDiff'] > x_start)
                                    & (df_slice['CumTimeDiff'] <= x_end)]
                df_range = df_range.reset_index(drop=True)

                if df_range.empty:
                    df_start = df_slice[
                        df_slice['CumTimeDiff'] <= x_start].tail(1)
                    df_end = df_slice[df_slice['CumTimeDiff'] > x_end].head(1)

                    if not df_start.empty and not df_end.empty:
                        df_middlePoint = trk_utils.getPointInTheMiddle(
                            df_start, df_end, time_diff, point_idx)
                        df_range = DataFrame(df_middlePoint,
                                             columns=list(df_cum))

                    point_idx = point_idx + 1
                else:
                    point_idx = 1

                df_range['VideoFrame'] = i
                df_norm = pd.concat([df_norm, df_range])
        df_norm = df_norm.reset_index(drop=True)

        return self.__class__(df_norm, list(df_norm))

    def setColors(self, column_name, individual_tracks=True):
        if column_name not in self.df:
            raise TrackException('Column name not found', "'%s'" % column_name)

        df = self.df.copy()

        df_colors = pd.DataFrame()

        if individual_tracks:
            grouped = df['CodeRoute'].unique()

            for name in grouped:
                df_slice = df[df['CodeRoute'] == name]
                df_slice = df_slice.reset_index(drop=True)

                min = df_slice[column_name].min()
                max = df_slice[column_name].max()

                df_slice['Color'] = df_slice[column_name].apply(trk_utils.rgb,
                                                                minimum=min,
                                                                maximum=max)
                df_colors = pd.concat([df_colors, df_slice])

            df_colors = df_colors.reset_index(drop=True)
            return self.__class__(df_colors, list(df_colors))
        else:
            min = df[column_name].min()
            max = df[column_name].max()

            df['Color'] = df[column_name].apply(trk_utils.rgb,
                                                minimum=min,
                                                maximum=max)
            df = df.reset_index(drop=True)

            return self.__class__(df, list(df))

    def dropDuplicates(self):
        """
		Drop points of the same track with the same Latitude and Longitude.
		"""
        return self.__class__(
            self.df.drop_duplicates(['CodeRoute', 'Latitude', 'Longitude']))

    def toDict(self):
        """
		Convert the data frame to a dictionary
		like [{column -> value}, ... , {column -> value}]
		"""
        return self.df.to_dict('records')

    def getBounds(self):
        """
		Get the bounds of the DFTrack
			
		Returns
		-------
		bounds: gpxpy.GPXBounds
		"""
        min_lat = self.df['Latitude'].min()
        max_lat = self.df['Latitude'].max()
        min_lng = self.df['Longitude'].min()
        max_lng = self.df['Longitude'].max()

        return GPXBounds(min_lat, max_lat, min_lng, max_lng)

    def concat(self, dfTrack):
        """
		Concatenate DFTrack objects with 'self'

		Parameters
		----------
		dfTrack: DFTrack or list of DFTrack
			The ones that will be joined with 'self'
			
		Returns
		-------
		df_concat: DFTrack
			A DFTrack with all the DFTracks concatenated.
		"""
        if not isinstance(dfTrack, list):
            # If it is not a list of DFTrack, make a list of one element
            dfTrack = [dfTrack]

        df_concat = [self.df]  # First element is 'self'

        # From list of 'dfTrack', create a list of their dataframes
        for df in dfTrack:
            if not isinstance(df, DFTrack):
                raise TrackException("Parameter must be a 'DFTrack' object",
                                     '%s found' % type(df))

            df_concat.append(df.df)

        return self.__class__(pd.concat(df_concat))
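
# Standalone pandas sketch of the core calls used by getTracksByTime and dropDuplicates
# above (toy data; the DFTrack wrapper itself is not needed to run this).
import pandas as pd

pts = pd.DataFrame({
    'CodeRoute': ['R1', 'R1', 'R1', 'R2'],
    'Date': pd.to_datetime(['2019-06-01 08:15', '2019-06-01 08:15',
                            '2019-06-01 14:30', '2019-06-01 09:00']),
    'Latitude': [43.0, 43.0, 43.1, 44.0],
    'Longitude': [-8.0, -8.0, -8.1, -9.0],
})

idx = pd.DatetimeIndex(pts['Date'])
morning = pts.iloc[idx.indexer_between_time('08:00', '12:00')]  # time-of-day filter
morning = morning.drop_duplicates(['CodeRoute', 'Latitude', 'Longitude'])  # same subset as dropDuplicates()
print(morning.reset_index(drop=True))
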
Exemple #43
0
# 6     6    b
# 7     6    b


print(dupli_data.duplicated())
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# 5     True
# 6    False
# 7     True
# dtype: bool

dupli_data = dupli_data.drop_duplicates()

print(dupli_data)
#    col1 col2
# 0     1    a
# 1     1    b
# 2     2    b
# 3     3    b
# 4     4    c
# 6     6    b


city_map ={"서울":"서울", 
           "부산":"경상도", 
           "대전":"충청도", 
           "광주":"전라도",
Exemple #44
0
def test_drop_duplicates():
    df = DataFrame({
        "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
        "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
        "C": [1, 1, 2, 2, 2, 2, 1, 2],
        "D":
        range(8),
    })
    # single column
    result = df.drop_duplicates("AAA")
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep="last")
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(["AAA", "B"]))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep="last")
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ["AAA", "B", "C"]]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep="last")
    expected = df2.drop_duplicates(["AAA", "B"], keep="last")
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(["AAA", "B"], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates("C")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("C", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    df["E"] = df["C"].astype("int8")
    result = df.drop_duplicates("E")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("E", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = df.append([[1] + [0] * 8], ignore_index=True)

    for keep in ["first", "last", False]:
        assert df.duplicated(keep=keep).sum() == 0
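
# A compact, self-contained illustration of the keep='first' / keep='last' / keep=False
# semantics exercised by the test above (toy frame, not part of the test suite).
import pandas as pd

toy = pd.DataFrame({'AAA': ['foo', 'bar', 'foo'], 'C': [1, 1, 2]})
print(toy.drop_duplicates('AAA'))               # rows 0 and 1: first occurrence of each value
print(toy.drop_duplicates('AAA', keep='last'))  # rows 1 and 2: last occurrence of each value
print(toy.drop_duplicates('AAA', keep=False))   # row 1 only: both 'foo' rows are removed
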
                            print("(attempt " + str(i + 1) + "/" + str(CHUNK_ATTEMPTS) + ")")

                            if i < CHUNK_ATTEMPTS - 1:
                                time.sleep(CHUNK_ATTEMPT_SLEEP)
                    # time taken for this loop to execute
                    # print("time taken to execute loop took ", time.time() - start_time)
                    if error != None:
                        sys.exit("max retries exceeded.")

            # f.write(json.dumps(messages))
            # f.close()
            # print()
            # print("saving to " + file_name)
            # print("done2!")

            dic2['vid'] = vid
            dic2['commenter'] = commenter
            dic2['time'] = time_stamp
            dic2['comment'] = comment_list

            df2 = DataFrame(data = dic2)
            df2 = df2.append(df2)
            df2.drop_duplicates(subset = None, inplace = True)
            df2.to_csv(chosenDirectory+"/esports_videos_comments/" + name[0] + "_" + name[1] + "_" + str(time.time()) +  "_comments.csv", sep=',')

            # Save for each channel
            # df = DataFrame(data = details)
            # df.to_csv(chosenDirectory+"/channel_comments/" + channel + "_most_viewes.csv")
        except:
            print(str(datetime.datetime.now())+": Video broken or something")
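
# Minimal sketch of the accumulate -> drop_duplicates -> to_csv pattern used in the
# fragment above, written with pd.concat instead of the deprecated DataFrame.append
# (column names are shortened and the output path is illustrative only).
import pandas as pd

chunks = [
    pd.DataFrame({'vid': ['v1'], 'commenter': ['alice'], 'comment': ['gg']}),
    pd.DataFrame({'vid': ['v1'], 'commenter': ['alice'], 'comment': ['gg']}),  # duplicate chunk
]
all_comments = pd.concat(chunks, ignore_index=True)
all_comments = all_comments.drop_duplicates()        # subset=None: compare all columns
all_comments.to_csv('comments_example.csv', sep=',', index=False)
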
Exemple #46
0
# data drop by column or index
base.chapter('data drop by column & index')
df3 = df2.drop(columns=['chinese'])
df4 = df3.drop(index=['Zhangfei'])
print("df3:\n", df3)
print("df4:\n", df4)

# rename columns
base.chapter('rename columns')
# same as df2.rename(columns={'chinese':'Yuwen', 'english':'Yingyu'}, inplace=True)
df3 = df2.rename(columns={'chinese':'Yuwen', 'english':'Yingyu'}, inplace=False)
print("df2:\n", df3)

# drop duplicates
base.chapter('drop duplicate')
df1 = df1.drop_duplicates()

# space character strip
base.chapter('space character strip')
# strip & lstrip & rstrip
#df2['chinese'] = df2['chinese'].map(str.strip)

# upper & lower & title
df2.columns = df2.columns.str.upper()
df2.columns = df2.columns.str.lower()

# find null
data = {
    "chinese": [66, 95, 93, 90, 80],
    "english": [65, None, 92, 88, 90],
    "math": [30, 98, 96, None, 90],
Exemple #47
0
def detect_ts(df,
              max_anoms=0.10,
              direction='pos',
              alpha=0.05,
              threshold=None,
              e_value=False,
              longterm=False,
              piecewise_median_period_weeks=2,
              granularity='day',
              verbose=False,
              inplace=True):
    """
    Anomaly Detection Using Seasonal Hybrid ESD Test
    A technique for detecting anomalies in seasonal univariate time series where the input is a
    series of <timestamp, value> pairs.

    Args:

    df: Time series as a two column data frame where the first column consists of the integer UTC Unix
    timestamps and the second column consists of the observations.

    max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the
    data.

    direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both').

    alpha: The level of statistical significance with which to accept or reject anomalies.

    only_last: Find and report anomalies only within the last day or hr in the time series. Options: (None | 'day' | 'hr')

    threshold: Only report positive going anoms above the threshold specified. Options are: (None | 'med_max' | 'p95' | 'p99')

    e_value: Add an additional column to the anoms output containing the expected value.

    longterm: Increase anom detection efficacy for time series that are greater than a month.

    See Details below.
    piecewise_median_period_weeks: The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal
    (2014). Defaults to 2.

    Details


    'longterm' This option should be set when the input time series is longer than a month.
    The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).
    'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller
    than one of the specified thresholds which include: the median
    of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
    99th percentile of the daily max values (p99).

    The returned value is a dictionary with the following components:
      anoms: Data frame containing timestamps, values, and optionally expected values.
      plot: A graphical object if plotting was requested by the user. The plot contains
      the estimated anomalies annotated on the input time series
    """

    if not isinstance(df, DataFrame):
        raise ValueError("data must be a single data frame.")
    else:
        if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all():
            raise ValueError(
                '''data must be a 2 column data.frame, with the first column being a set of timestamps, and
                                the second column being numeric values.''')

        if not (df.dtypes[0].type is np.float64) and not (df.dtypes[0].type is
                                                          np.int64):
            raise ValueError(
                '''The input timestamp column must be a float or integer of the unix timestamp, not date
                                time columns, date strings or pd.TimeStamp columns.'''
            )

    if not inplace:
        df = copy.deepcopy(df)

    # change the column names in place, rather than copying the entire dataset, but save the headers to replace them.
    orig_header = df.columns.values
    df.rename(columns={
        df.columns.values[0]: "timestamp",
        df.columns.values[1]: "value"
    },
              inplace=True)

    # Sanity check all input parameters
    if max_anoms > 0.49:
        length = len(df.value)
        raise ValueError(
            "max_anoms must be less than 50%% of the data points (max_anoms =%f data_points =%s)."
            % (round(max_anoms * length, 0), length))

    if direction not in ['pos', 'neg', 'both']:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings
            warnings.warn(
                "alpha is the statistical signifigance, and is usually between 0.01 and 0.1"
            )

    if threshold not in [None, 'med_max', 'p95', 'p99']:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(longterm, bool):
        raise ValueError("longterm must be a boolean")

    if piecewise_median_period_weeks < 2:
        raise ValueError(
            "piecewise_median_period_weeks must be at greater than 2 weeks")

    # if the data is daily, then we need to bump the period to weekly to get multiple examples
    gran = granularity
    gran_period = {'ms': 60000, 'sec': 3600, 'min': 1440, 'hr': 24, 'day': 7}
    period = gran_period.get(gran)
    if not period:
        raise ValueError(
            '%s granularity detected. This is currently not supported.' %
            (gran, ))

    # now convert the timestamp column into a proper timestamp
    df['timestamp'] = df['timestamp'].map(
        lambda x: datetime.datetime.utcfromtimestamp(x))

    num_obs = len(df.value)

    clamp = (1 / float(num_obs))
    if max_anoms < clamp:
        max_anoms = clamp

    if longterm:
        if gran == "day":
            num_obs_in_period = period * piecewise_median_period_weeks + 1
            num_days_in_period = 7 * piecewise_median_period_weeks + 1
        else:
            num_obs_in_period = period * 7 * piecewise_median_period_weeks
            num_days_in_period = 7 * piecewise_median_period_weeks

        last_date = df.timestamp.iget(-1)

        all_data = []

        for j in range(0, len(df.timestamp), num_obs_in_period):
            start_date = df.timestamp.iget(j)
            end_date = min(
                start_date + datetime.timedelta(days=num_obs_in_period),
                df.timestamp.iget(-1))

            # if there are at least 14 days left, subset it; otherwise subset last_date - 14 days
            if (end_date - start_date).days == num_days_in_period:
                sub_df = df[(df.timestamp >= start_date)
                            & (df.timestamp < end_date)]
            else:
                sub_df = df[(df.timestamp > (
                    last_date - datetime.timedelta(days=num_days_in_period)))
                            & (df.timestamp <= last_date)]
            all_data.append(sub_df)
    else:
        all_data = [df]

    all_anoms = DataFrame(columns=['timestamp', 'value'])
    seasonal_plus_trend = DataFrame(columns=['timestamp', 'value'])

    # Detect anomalies on all data (either entire data in one-pass, or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {
            'pos': Direction(True, True),
            'neg': Direction(True, False),
            'both': Direction(False, True)
        }
        anomaly_direction = directions[direction]

        # detect_anoms actually performs the anomaly detection and returns the result in a list containing the anomalies
        # as well as the decomposed components of the time series for further analysis.

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose)
        if s_h_esd_timestamps is None:
            return {'anoms': DataFrame(columns=["timestamp", "anoms"])}

        # store decomposed comps in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps['stl']
        s_h_esd_timestamps = s_h_esd_timestamps['anoms']

        # -- Step 3: Use detected anomaly timestamps to extract the actual anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=['timestamp', 'value'])

        # Filter the anomalies using one of the thresholding functions if applicable
        if threshold:
            # Calculate daily max values
            periodic_maxes = df.groupby(df.timestamp.map(
                Timestamp.date)).aggregate(np.max).value

            # Calculate the threshold set by the user
            thresh = 0.5
            if threshold == 'med_max':
                thresh = periodic_maxes.median()
            elif threshold == 'p95':
                thresh = periodic_maxes.quantile(.95)
            elif threshold == 'p99':
                thresh = periodic_maxes.quantile(.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms.value >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    try:
        all_anoms.drop_duplicates(subset=['timestamp'])
        seasonal_plus_trend.drop_duplicates(subset=['timestamp'])
    except TypeError:
        all_anoms.drop_duplicates(cols=['timestamp'])
        seasonal_plus_trend.drop_duplicates(cols=['timestamp'])

    # Calculate number of anomalies as a percentage
    anom_pct = (len(df.value) / float(num_obs)) * 100

    # name the columns back
    df.rename(columns={
        "timestamp": orig_header[0],
        "value": orig_header[1]
    },
              inplace=True)

    if anom_pct == 0:
        return {"anoms": None}

    all_anoms.index = all_anoms.timestamp

    if e_value:
        d = {
            'timestamp':
            all_anoms.timestamp,
            'anoms':
            all_anoms.value,
            'expected_value':
            seasonal_plus_trend[seasonal_plus_trend.timestamp.isin(
                all_anoms.timestamp)].value
        }
    else:
        d = {'timestamp': all_anoms.timestamp, 'anoms': all_anoms.value}

    anoms = DataFrame(d, index=d['timestamp'].index)

    # convert timestamps back to unix time
    anoms['timestamp'] = anoms['timestamp'].astype(np.int64)
    anoms['timestamp'] = anoms['timestamp'].map(lambda x: x * 10e-10)

    return {'anoms': anoms}
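
# Hedged usage sketch for detect_ts above. Its helpers (detect_anoms, Direction,
# Timestamp) live in the same module and are not shown here, so the call itself is
# left commented; the input is the documented two-column <unix timestamp, value> frame.
import numpy as np
import pandas as pd

hours = 24 * 14
raw = pd.DataFrame({
    'ts': 1500000000 + 3600 * np.arange(hours, dtype=np.int64),   # hourly unix timestamps
    'count': np.random.default_rng(0).poisson(10, hours).astype(float),
})
# result = detect_ts(raw, max_anoms=0.05, direction='both', granularity='hr')
# result['anoms'] would be a DataFrame of anomalous <timestamp, anoms> rows
# (plus 'expected_value' when e_value=True).
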
def get_different_coef_by_class(data_all, type_name, ds_list):

    # Define the reference dimension columns; user value is later computed from them
    cols = [
        'ora_source', 'ora_channel', 'ora_activity_type', 'ora_parent_channel',
        'ora_channel_type', 'ora_business_type'
    ]
    xishu_table = []
    colList = get_descending_list_order(cols)
    # colList = [['ora_business_type']]
    # print colList
    for col in colList:

        # print col
        data = data_all[data_all['type'] == type_name].groupby(col +
                                                               ['ds']).sum()
        data = data.reset_index()
        data = add_columns(data)

        # # for testing
        # if 'ora_channel' not in col:
        #     continue
        # print data.head(1)

        for index1, row in data[(data['ds'] >= ds_list['ds_0'])
                                & (data['ds'] <= ds_list['ds_1']
                                   )][col].drop_duplicates().iterrows():
            # print row
            contition = gen_condition_str_by_series(row).decode('utf8')
            # print contition
            # print a

            head_col = collections.OrderedDict()

            head_col = collections.OrderedDict.fromkeys(cols)
            for i in head_col:
                if i in col:
                    head_col[i] = row[i]
                    # data[i] = data[i].apply(lambda a: str(a).decode('utf8'))
                else:
                    head_col[i] = '-'
            # print head_col
            data1 = data[eval(contition)]
            data1 = delete_exception_value(data1)
            if data1.empty:
                continue
            if type_name in [
                    'e_supplier_profit', 'common_order_profit',
                    'tao2_order_profit', 'zy_order_profit'
            ]:
                coef_dict = get_coef_of_e_supplier(data1, ds_list)
            if type_name in [
                    'ad_profit', 'ad_common_profit', 'ad_licai_profit',
                    'licai_order_profit'
            ]:
                coef_dict = get_coef_of_ad_or_licai(data1, ds_list)
            # print coef_dict

            coef_dict = collections.OrderedDict(head_col.items() +
                                                coef_dict.items())
            xishu_table.append(coef_dict)
            # print xishu_table
            # break

    df = DataFrame(xishu_table)
    df = df.drop_duplicates()
    return df
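
# Standalone sketch of the "iterate over distinct combinations of grouping columns"
# pattern used above: drop_duplicates() on the dimension columns, then iterrows()
# over the surviving combinations (toy values, same column names).
import pandas as pd

data = pd.DataFrame({
    'ora_channel': ['app', 'app', 'web', 'web'],
    'ora_source': ['cn', 'cn', 'cn', 'us'],
    'profit': [1.0, 2.0, 3.0, 4.0],
})
for _, combo in data[['ora_channel', 'ora_source']].drop_duplicates().iterrows():
    subset = data[(data['ora_channel'] == combo['ora_channel']) &
                  (data['ora_source'] == combo['ora_source'])]
    print(dict(combo), subset['profit'].sum())
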
Exemple #49
0
# # print(df2)
#
# # df3 = pd.merge(df1, df2, on="name")
# df3 = pd.merge(df1, df2, how='outer' )
# print(df3)


# from pandasql import sqldf, load_births, load_meat
# df1 = DataFrame({'name':['ZhangFei', 'GuanYu', 'a', 'b', 'c'], 'data1':range(5)})
# pysqldf = lambda sql: sqldf(sql, globals())
# sql = "select * from df1 where name ='ZhangFei'"
# print(pysqldf(sql))
def sum(df: DataFrame):
    df['总分'] = df["中文"] + df["英语"] + df["数学"]
    return df


df = DataFrame(pd.read_csv("data.txt"))

df.rename(columns={
    "Chinese": "中文",
    "English": "英语",
    "Math": "数学"
},
          inplace=True)
df = df.drop_duplicates()
print(df)
df = df.apply(sum, axis=1)
print("--------")
print(df)
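
# The row-wise apply above works but shadows the built-in sum; a hedged, more idiomatic
# alternative is a vectorized column sum (toy frame below with the same column names).
import pandas as pd

scores = pd.DataFrame({'中文': [66, 95], '英语': [65, 85], '数学': [30, 98]})
scores['总分'] = scores[['中文', '英语', '数学']].sum(axis=1)  # vectorized row total
print(scores)
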
## subset duplicated values
df3.duplicated(subset=['A', 'B'])
df3.duplicated(['A', 'B'])

## HOW to get all values that have duplicates (purging)
t_b = df3.duplicated()
b_t = df3.duplicated(take_last=True)
unique = ~(t_b | b_t)  # complement where either is true
unique = ~t_b & ~b_t
unique

df3[unique]

# DROPPING DUPLICATES --------------------------------------------
df3.drop_duplicates()
df3.drop_duplicates(take_last=True)

## this is the same as
t_b = df3.duplicated()
df3[~t_b]
df3.drop_duplicates() == df3[~t_b]

## subset criteria
df3.drop_duplicates(['A', 'B'])

# WHEN TO USE ----------------------------------------------------------

## if you want to keep the first duplicated value (from top) and remove others
df3.drop_duplicates()
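
# take_last is gone in modern pandas (replaced by keep=); the same purging recipe with
# the current API, using a toy frame standing in for df3:
import pandas as pd

df3 = pd.DataFrame({'A': [1, 1, 2], 'B': ['x', 'x', 'y']})
t_b = df3.duplicated()                   # marks later copies (keep='first' is the default)
b_t = df3.duplicated(keep='last')        # marks earlier copies
print(df3[~(t_b | b_t)])                 # rows that are fully unique
print(df3.drop_duplicates(keep='last'))  # keep the last copy of each duplicate group
print(df3[~df3.duplicated(keep=False)])  # same "fully unique" rows in one call
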
import pandas as pd
from pandas import DataFrame

df = DataFrame({
    'col1':['uber', 'uber', 'uber', 'uber', 'grab'],
    'col2':[5,4,3,3,5]
})

print df
print df.duplicated()

#drop duplicates
print df.drop_duplicates()

#drop column
print df.drop_duplicates(['col1'])

#
print df.drop_duplicates(['col1'],keep='last')
def load_img_metadata(meta_df: pd.DataFrame, imgs_df: pd.DataFrame):
    imgs_df['id'] = imgs_df['id'].astype(int)
    imgs_df.drop_duplicates(subset=['cover url'], inplace=True)
    ans = meta_df.merge(imgs_df, left_on='imdb_id', right_on='id', how='inner')
    return ans
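
# Self-contained toy run of the dedupe-then-merge logic above, assuming the
# load_img_metadata definition just shown is in scope (tiny frames, same column names).
import pandas as pd

meta_df = pd.DataFrame({'imdb_id': [1, 2], 'title': ['A', 'B']})
imgs_df = pd.DataFrame({'id': ['1', '1', '2'],
                        'cover url': ['u1', 'u1', 'u2']})
print(load_img_metadata(meta_df, imgs_df))  # one merged row per surviving cover url
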
def IBTrading(app):
    #SetupLogger() Already setup in the main script

    app.logger_report.warning("Algorithm checktime: %s",
                              datetime.datetime.now())
    time.sleep(ib_api_include.sleeptime_trading)

    # Get the open position for the account. Through flex query delayed data
    #openPosition = GetOpenPosition()

    reqId = 1
    # Reteieve the market data and news for the open options.

    # Get the open position from the portfolio
    portfolio = app.reqPortfolio()
    portfSummary = DataFrame(portfolio,
                             columns=[
                                 'symbol', 'secType', 'exchange', 'currency',
                                 'lastTradeDateOrContractMonth', 'multiplier',
                                 'right', 'strike', 'position', 'marketPrice',
                                 'marketValue', 'averageCost', 'unrealizedPNL',
                                 'realizedPNL', 'accountName'
                             ])
    portfSummary = portfSummary.drop_duplicates(subset=[
        'symbol', 'secType', 'exchange', 'currency',
        'lastTradeDateOrContractMonth', 'multiplier', 'right', 'strike'
    ],
                                                keep='first')
    print(portfSummary)

    reqArraySize = ib_api_include.reqArraySize
    reqIdList = list(range(1,
                           reqArraySize * len(portfSummary) +
                           1))  # Not hard coded, replace 4 with REQ_ARRAY_SIZE

    symList = []
    secTypeList = []
    app.logger_report.warning("Portfolio:{}".format(portfSummary))

    for i in range(0, len(portfSummary)):
        item = portfSummary.iloc[i]
        symList += [item['symbol']] * reqArraySize
        secTypeList += [item['secType']] * reqArraySize
    reqIdSymList = DataFrame({
        "reqId": reqIdList,
        "symbol": symList,
        'secType': secTypeList
    })

    marketReturn = app.GetMarketDataNews(portfSummary, reqId, reqArraySize)
    # Sheet names, profile_stock_threshold, profile_opt_threshold, watchlist_stock_treshold, watchlist_opt_threshold
    profile_stock_threshold = pd.read_excel(
        ib_api_include.thresholdFileName, sheet_name='profile_stock_threshold')
    option_profit_threshold = pd.read_excel(ib_api_include.thresholdFileName,
                                            sheet_name='profile_opt_threshold')
    watchlist_stock_treshold = pd.read_excel(
        ib_api_include.thresholdFileName,
        sheet_name='watchlist_stock_treshold')
    watchlist_stock_treshold = DataFrame(watchlist_stock_treshold)
    tickTypeIdMapping = pd.read_csv(ib_api_include.tickTypeIdMapping,
                                    index_col=None)
    tickTypeIdMapping["TickId"] = tickTypeIdMapping["TickId"].astype(int)
    marketPrice = DataFrame(
        marketReturn[0],
        columns=["reqId", "tickTypeId", "price", "executeNow"])
    marketSize = DataFrame(marketReturn[1],
                           columns=["reqId", "tickTypeId", "volume"])
    optionInfo = DataFrame(marketReturn[2],
                           columns=[
                               "reqId", "tickTypeId", "optionInfo", "delta",
                               "optPrice", "pvDividend", "gamma", "vega",
                               "theta", "undPrice"
                           ])
    #Check if it is live trading
    if datetime.datetime.now() >= datetime.datetime.strptime(str(datetime.datetime.now().date()) + ' 15:55:00','%Y-%m-%d %H:%M:%S') \
            or datetime.datetime.now() < datetime.datetime.strptime(str(datetime.datetime.now().date()) + ' 09:30:00','%Y-%m-%d %H:%M:%S'):
        print("Trading not alive!")
        return None
    news = DataFrame(marketReturn[3],
                     columns=[
                         "tickerId", "timeStamp", "providerCode", "articleId",
                         "headline", "effect"
                     ])
    portfSummary = marketReturn[4]
    marketPrice = pd.merge(marketPrice,
                           tickTypeIdMapping,
                           left_on="tickTypeId",
                           right_on="TickId",
                           how="left")
    marketSize = pd.merge(marketSize,
                          tickTypeIdMapping,
                          left_on="tickTypeId",
                          right_on="TickId",
                          how="left")
    optionInfo = pd.merge(optionInfo,
                          tickTypeIdMapping,
                          left_on="tickTypeId",
                          right_on="TickId",
                          how="left")
    print("Here is the portfolio")
    print(portfSummary)
    marketPrice = pd.merge(marketPrice, reqIdSymList, on="reqId", how="left")
    marketSize = pd.merge(marketSize, reqIdSymList, on="reqId", how="left")
    optionInfo = pd.merge(optionInfo, reqIdSymList, on="reqId", how="left")
    app.logger_report.warning("marketPrice:{}".format(marketPrice))
    app.logger_report.warning("marketSize:{}".format(marketSize))
    app.logger_report.warning("optionInfo:{}".format(optionInfo))
    news = pd.merge(news,
                    reqIdSymList,
                    left_on="tickerId",
                    right_on="reqId",
                    how="left")

    # Use AI model to build up the model for the option or the stock.
    my_ib_trading = open(ib_api_include.generatedFileName, 'r')
    tradingFile = json.load(my_ib_trading)
    my_ib_trading.close()
    tradingFile["ready_flag"] = True
    tradingFile["previous_time"] = str(datetime.datetime.now() -
                                       datetime.timedelta(minutes=1))[0:-7]
    tradingFile["updated_time"] = str(datetime.datetime.now())[0:-7]
    tradingFile['transactions'] = []
    print(portfSummary)
    commissionFee = ib_api_include.commissionFee

    for index, item in portfSummary.iterrows():
        if item['secType'] == 'STK':
            Stock_Algo = ib_trading_algo_oop.Stock_Algo(
                profile_stock_threshold, item, tradingFile)
            tradingFile = Stock_Algo.tradingStock()
    for index, item in portfSummary.iterrows():
        if item['secType'] == 'OPT':
            # Get the option information
            infoIndivOption = app.GetInfoIndivOption(optionInfo, item)
            print(infoIndivOption)
            if infoIndivOption != 'Yes':
                print(infoIndivOption)
                optPrice = infoIndivOption[0]
                undPrice = infoIndivOption[1]
                gamma = infoIndivOption[2]
                delta = infoIndivOption[3]
                theta = infoIndivOption[4]
                avgCost = infoIndivOption[5]
                multiplier = infoIndivOption[6]
                Option_Algo = ib_trading_algo_oop.Option_Algo(
                    option_profit_threshold, optPrice, undPrice, gamma, delta,
                    theta, avgCost, multiplier, item, tradingFile,
                    commissionFee)

                # Merge the option in portfolio with the option price, size and the option Greek values.
                if item['position'] > 0 and item['right'] == 'P':
                    tradingFile = Option_Algo.optionBuyPutAlgo()
                elif item['position'] < 0 and item['right'] == 'P':
                    tradingFile = Option_Algo.optionSellPutAlgo()
                elif item['position'] > 0 and item['right'] == 'C':
                    tradingFile = Option_Algo.optionBuyCallAlgo()
                elif item['position'] < 0 and item['right'] == 'C':
                    tradingFile = Option_Algo.optionSellCallAlgo()

            elif infoIndivOption == 'Yes':
                pass

    app.logger_report.warning(tradingFile)
    writeExeFile = open(ib_api_include.generatedFileName, 'w+')
    writeExeFile.write(json.dumps(tradingFile, indent=4))
    writeExeFile.close()
Exemple #54
0
def test_drop_duplicates_NA():
    # none
    df = DataFrame({'A': [None, None, 'foo', 'bar',
                          'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('A')
    expected = df.loc[[0, 2, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('A', keep='last')
    expected = df.loc[[1, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('A', keep=False)
    expected = df.loc[[]]  # empty df
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    result = df.drop_duplicates(['A', 'B'])
    expected = df.loc[[0, 2, 3, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['A', 'B'], keep='last')
    expected = df.loc[[1, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['A', 'B'], keep=False)
    expected = df.loc[[6]]
    tm.assert_frame_equal(result, expected)

    # nan
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('C')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('C', keep='last')
    expected = df.loc[[3, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('C', keep=False)
    expected = df.loc[[]]  # empty df
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    result = df.drop_duplicates(['C', 'B'])
    expected = df.loc[[0, 1, 2, 4]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['C', 'B'], keep='last')
    expected = df.loc[[1, 3, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['C', 'B'], keep=False)
    expected = df.loc[[1]]
    tm.assert_frame_equal(result, expected)
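
# Quick standalone check of the NA behaviour the test above relies on: None/NaN values
# compare equal to each other for duplicated() and drop_duplicates().
import numpy as np
import pandas as pd

na = pd.DataFrame({'A': [None, None, 'foo'], 'C': [1.0, np.nan, np.nan]})
print(na.drop_duplicates('A'))  # the second None row (index 1) is dropped
print(na.drop_duplicates('C'))  # the second NaN row (index 2) is dropped
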
Exemple #55
0
def detect_ts(df,
              max_anoms=0.10,
              direction='pos',
              alpha=0.05,
              only_last=None,
              threshold=None,
              e_value=False,
              longterm=False,
              piecewise_median_period=3,
              plot=False,
              y_log=False,
              xlabel='',
              ylabel='count',
              title=None,
              verbose=False,
              custom_period=None,
              use_period=True):
    """
    Anomaly Detection Using Seasonal Hybrid ESD Test
    A technique for detecting anomalies in seasonal univariate time series where the input is a
    series of <timestamp, value> pairs.

    Args:

    df: Time series as a two column data frame where the first column consists of the
    timestamps and the second column consists of the observations.

    max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the
    data.

    direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both').

    alpha: The level of statistical significance with which to accept or reject anomalies.

    only_last: Find and report anomalies only within the last day or hr in the time series. Options: (None | 'day' | 'hr')

    threshold: Only report positive going anoms above the threshold specified. Options are: (None | 'med_max' | 'p95' | 'p99')

    e_value: Add an additional column to the anoms output containing the expected value.

    longterm: Increase anom detection efficacy for time series that are greater than a month.

    See Details below.
    piecewise_median_period: The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal (2014). Defaults to 3.

    plot: (Currently unsupported) A flag indicating if a plot with both the time series and the estimated anoms,
    indicated by circles, should also be returned.

    y_log: Apply log scaling to the y-axis. This helps with viewing plots that have extremely
    large positive anomalies relative to the rest of the data.

    xlabel: X-axis label to be added to the output plot.
    ylabel: Y-axis label to be added to the output plot.

    Details


    'longterm' This option should be set when the input time series is longer than a month.
    The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014).
    'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller
    than one of the specified thresholds which include: the median
    of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the
    99th percentile of the daily max values (p99).
    'title' Title for the output plot.
    'verbose' Enable debug messages

    The returned value is a dictionary with the following components:
      anoms: Data frame containing timestamps, values, and optionally expected values.
      plot: A graphical object if plotting was requested by the user. The plot contains
      the estimated anomalies annotated on the input time series
    """

    if not isinstance(df, DataFrame):
        raise ValueError("data must be a single data frame.")
    else:
        if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all():
            raise ValueError(("data must be a 2 column data.frame, with the"
                              "first column being a set of timestamps, and "
                              "the second coloumn being numeric values."))

        if (not (df.dtypes[0].type is np.datetime64)
                and not (df.dtypes[0].type is np.int64)):
            df = format_timestamp(df)

    if list(df.columns.values) != ["timestamp", "value"]:
        df.columns = ["timestamp", "value"]

    # Sanity check all input parameters
    if max_anoms > 0.49:
        length = len(df.value)
        raise ValueError(("max_anoms must be less than 50% of "
                          "the data points (max_anoms =%f data_points =%s).") %
                         (round(max_anoms * length, 0), length))

    if not direction in ['pos', 'neg', 'both']:
        raise ValueError("direction options are: pos | neg | both.")

    if not (0.01 <= alpha <= 0.1):
        if verbose:
            import warnings
            warnings.warn(("alpha is the statistical signifigance, "
                           "and is usually between 0.01 and 0.1"))

    if only_last and not only_last in ['day', 'hr']:
        raise ValueError("only_last must be either 'day' or 'hr'")

    if not threshold in [None, 'med_max', 'p95', 'p99']:
        raise ValueError("threshold options are: None | med_max | p95 | p99")

    if not isinstance(e_value, bool):
        raise ValueError("e_value must be a boolean")

    if not isinstance(longterm, bool):
        raise ValueError("longterm must be a boolean")

    if piecewise_median_period < 3:
        raise ValueError(
            "piecewise_median_period must be at greater than 2 periods")

    if not isinstance(plot, bool):
        raise ValueError("plot must be a boolean")

    if not isinstance(y_log, bool):
        raise ValueError("y_log must be a boolean")

    if not isinstance(xlabel, basestring):
        raise ValueError("xlabel must be a string")

    if not isinstance(ylabel, basestring):
        raise ValueError("ylabel must be a string")

    if title and not isinstance(title, basestring):
        raise ValueError("title must be a string")

    if not title:
        title = ''
    else:
        title = title + " : "

    gran = get_gran(df)

    if gran == "day":
        num_days_per_line = 7
        if isinstance(only_last, basestring) and only_last == 'hr':
            only_last = 'day'
    else:
        num_days_per_line = 1

    if gran == 'sec':
        df.timestamp = date_format(df.timestamp, "%Y-%m-%d %H:%M:00")
        df = format_timestamp(df.groupby('timestamp').aggregate(np.sum))

    # if the data is daily, then we need to bump
    # the period to weekly to get multiple examples
    gran_period = {'min': 1440, 'hr': 24, 'day': 7}
    if (custom_period == None):
        print "auto choose period"
        period = gran_period.get(gran)
    else:
        period = custom_period
    if not period:
        raise ValueError(
            '%s granularity detected. This is currently not supported.' % gran)
    num_obs = len(df.value)

    clamp = (1 / float(num_obs))
    if max_anoms < clamp:
        max_anoms = clamp

    if longterm:
        if gran == "day":
            num_obs_in_period = period * piecewise_median_period + 1
        #     num_days_in_period = 7 * piecewise_median_period + 1
        else:
            num_obs_in_period = period * piecewise_median_period
            # num_days_in_period = piecewise_median_period  # this line needs fixing if the period is not in days

        last_date = df.timestamp.iget(-1)

        all_data = []

        for j in range(0, len(df.timestamp), num_obs_in_period):
            start_date = df.timestamp.iget(j)

            end_date = min(
                start_date + datetime.timedelta(minutes=num_obs_in_period),
                df.timestamp.iget(-1))

            # if there are at least 14 days left, subset it,
            # otherwise subset last_date - 14 days
            if ((int)((end_date -
                       start_date).total_seconds())) == num_obs_in_period * 60:
                sub_df = df[(df.timestamp >= start_date)
                            & (df.timestamp < end_date)]
            else:
                sub_df = df[(df.timestamp > (
                    last_date - datetime.timedelta(minutes=num_obs_in_period)))
                            & (df.timestamp <= last_date)]
            all_data.append(sub_df)
    else:
        all_data = [df]

    all_anoms = DataFrame(columns=['timestamp', 'value'])
    seasonal_plus_trend = DataFrame(columns=['timestamp', 'value'])

    # Detect anomalies on all data (either entire data in one-pass,
    # or in 2 week blocks if longterm=TRUE)
    for i in range(len(all_data)):
        directions = {
            'pos': Direction(True, True),
            'neg': Direction(True, False),
            'both': Direction(False, True)
        }
        anomaly_direction = directions[direction]

        # detect_anoms actually performs the anomaly detection and
        # returns the results in a list containing the anomalies
        # as well as the decomposed components of the time series
        # for further analysis.

        s_h_esd_timestamps = detect_anoms(
            all_data[i],
            k=max_anoms,
            alpha=alpha,
            num_obs_per_period=period,
            use_decomp=True,
            one_tail=anomaly_direction.one_tail,
            upper_tail=anomaly_direction.upper_tail,
            verbose=verbose,
            use_period=use_period)

        # store decomposed components in local variable and overwrite
        # s_h_esd_timestamps to contain only the anom timestamps
        data_decomp = s_h_esd_timestamps['stl']
        s_h_esd_timestamps = s_h_esd_timestamps['anoms']

        # -- Step 3: Use detected anomaly timestamps to extract the actual
        # anomalies (timestamp and value) from the data
        if s_h_esd_timestamps:
            anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)]
        else:
            anoms = DataFrame(columns=['timestamp', 'value'])

        # Filter the anomalies using one of the thresholding functions if applicable
        if threshold:
            # Calculate daily max values
            periodic_maxes = df.groupby(df.timestamp.map(
                Timestamp.date)).aggregate(np.max).value

            # Calculate the threshold set by the user
            if threshold == 'med_max':
                thresh = periodic_maxes.median()
            elif threshold == 'p95':
                thresh = periodic_maxes.quantile(.95)
            elif threshold == 'p99':
                thresh = periodic_maxes.quantile(.99)

            # Remove any anoms below the threshold
            anoms = anoms[anoms.value >= thresh]

        all_anoms = all_anoms.append(anoms)
        seasonal_plus_trend = seasonal_plus_trend.append(data_decomp)

    # Cleanup potential duplicates
    try:
        all_anoms.drop_duplicates(subset=['timestamp'])
        seasonal_plus_trend.drop_duplicates(subset=['timestamp'])
    except TypeError:
        all_anoms.drop_duplicates(cols=['timestamp'])
        seasonal_plus_trend.drop_duplicates(cols=['timestamp'])

    # -- If only_last was set by the user,
    # create subset of the data that represent the most recent day
    if only_last:
        start_date = df.timestamp.iget(-1) - datetime.timedelta(days=7)
        start_anoms = df.timestamp.iget(-1) - datetime.timedelta(days=1)
        if gran is "day":
            breaks = 3 * 12
            num_days_per_line = 7
        else:
            if only_last == 'day':
                breaks = 12
            else:
                start_date = df.timestamp.iget(-1) - datetime.timedelta(days=2)
                # truncate to days
                start_date = datetime.date(start_date.year, start_date.month,
                                           start_date.day)
                start_anoms = (df.timestamp.iget(-1) -
                               datetime.timedelta(hours=1))
                breaks = 3

        # subset the last days worth of data
        x_subset_single_day = df[df.timestamp > start_anoms]
        # When plotting anoms for the last day only
        # we only show the previous weeks data
        x_subset_week = df[(df.timestamp <= start_anoms)
                           & (df.timestamp > start_date)]
        if len(all_anoms) > 0:
            all_anoms = all_anoms[
                all_anoms.timestamp >= x_subset_single_day.timestamp.iget(0)]
        num_obs = len(x_subset_single_day.value)

    # Calculate number of anomalies as a percentage
    anom_pct = (len(df.value) / float(num_obs)) * 100

    if anom_pct == 0:
        return {"anoms": None, "plot": None}

    # The original R implementation handles plotting here.
    # Plotting is currently not implemented in this version.
    # if plot:
    #     plot_something()

    all_anoms.index = all_anoms.timestamp

    if e_value:
        d = {
            'timestamp':
            all_anoms.timestamp,
            'anoms':
            all_anoms.value,
            'expected_value':
            seasonal_plus_trend[seasonal_plus_trend.timestamp.isin(
                all_anoms.timestamp)].value
        }
    else:
        d = {'timestamp': all_anoms.timestamp, 'anoms': all_anoms.value}
    anoms = DataFrame(d, index=d['timestamp'].index)

    return {'anoms': anoms, 'plot': None}
Exemple #56
0
import pandas as pd
from pandas import DataFrame

ReadCsv = pd.read_csv(r'gloss_entryID_synonyms.csv', sep=';', header='infer')
df_glosses = DataFrame(ReadCsv)
df_glosses.head()

new_df_glosses = df_glosses.drop_duplicates()
new_df_glosses.to_csv('gloss_entryID_synonyms.csv', sep=',', index=False)
def test_basic():

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"],
                       ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
Exemple #58
0
def famille(temporary_store=None, year=None):
    assert temporary_store is not None
    assert year is not None

    log.info('step_04_famille: building the famille table')

    # We follow the method described in the Guide ERF_2002_rétropolée, page 135
    # TODO: extract these values from a legislation parameters file
    if year == 2006:
        smic = 1254
    elif year == 2007:
        smic = 1280
    elif year == 2008:
        smic = 1308
    elif year == 2009:
        smic = 1337
    else:
        log.info("smic non défini")

    # TODO check if we can remove acteu forter etc since dealt with in 01_pre_proc

    log.info('Step 1: preparing the base')
    log.info('    1.1: retrieving indivi')
    indivi = temporary_store['indivim_{}'.format(year)]

    indivi['year'] = year
    indivi["noidec"] = indivi["declar1"].str[0:2].copy(
    )  # Not converted to int because some NaN are present
    indivi["agepf"] = (
        (indivi.naim < 7) * (indivi.year - indivi.naia) + (indivi.naim >= 7) *
        (indivi.year - indivi.naia - 1)).astype(
            object
        )  # TODO: naia has some NaN but naim do not and then should be an int

    indivi = indivi[~((indivi.lien == 6) & (indivi.agepf < 16) &
                      (indivi.quelfic == "EE"))].copy()

    assert_dtype(indivi.year, "int64")
    for series_name in ['agepf', 'noidec']:  # integer with NaN
        assert_dtype(indivi[series_name], "object")

    log.info('    1.2: retrieving the unborn children')
    individual_variables = [
        'acteu',
        'actrec',
        'agepf',
        'agepr',
        'cohab',
        'contra',
        'declar1',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noidec',
        'noimer',
        'noindiv',
        'noiper',
        'persfip',
        'quelfic',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        'year',
        'ztsai',
    ]
    enfants_a_naitre = temporary_store['enfants_a_naitre_{}'.format(
        year)][individual_variables].copy()
    enfants_a_naitre.drop_duplicates('noindiv', inplace=True)
    log.info(u""""
    Il y a {} enfants à naitre avant de retirer ceux qui ne sont pas enfants
    de la personne de référence
    """.format(len(enfants_a_naitre.index)))
    enfants_a_naitre = enfants_a_naitre[enfants_a_naitre.lpr == 3].copy()
    enfants_a_naitre = enfants_a_naitre[~(
        enfants_a_naitre.noindiv.isin(indivi.noindiv.values))].copy()
    log.info(u""""
    Il y a {} enfants à naitre après avoir retiré ceux qui ne sont pas enfants
    de la personne de référence
    """.format(len(enfants_a_naitre.index)))
    # PB with vars "agepf"  "noidec" "year"  NOTE: quels problèmes ? JS
    log.info(u"    1.3 : création de la base complète")
    base = concat([indivi, enfants_a_naitre])
    log.info(u"base contient {} lignes ".format(len(base.index)))
    base['noindiv'] = (100 * base.ident + base['noi']).astype(int)
    base['moins_de_15_ans_inclus'] = base.agepf < 16
    base['jeune_non_eligible_rsa'] = (base.agepf >= 16) & (base.agepf <
                                                           AGE_RSA)
    base['jeune_eligible_rsa'] = base.agepf >= AGE_RSA
    base['ztsai'].fillna(0, inplace=True)
    base['smic55'] = base['ztsai'] >= (smic * 12 * 0.55
                                       )  # 55% of the gross monthly smic
    base['famille'] = 0
    base['kid'] = False
    for series_name in [
            'kid', 'moins_de_15_ans_inclus', 'jeune_non_eligible_rsa',
            'jeune_eligible_rsa', 'smic55'
    ]:
        assert_dtype(base[series_name], "bool")
    assert_dtype(base.famille, "int")
    # TODO: remove or clean from NA assert_dtype(base.ztsai, "int")

    log.info(u"Etape 2 : On cherche les enfants ayant père et/ou mère")
    personne_de_reference = base[['ident', 'noi']][base.lpr == 1].copy()
    personne_de_reference['noifam'] = (
        100 * personne_de_reference.ident +
        personne_de_reference['noi']).astype(int)
    personne_de_reference = personne_de_reference[['ident', 'noifam']].copy()
    log.info(u"length personne_de_reference : {}".format(
        len(personne_de_reference.index)))
    nof01 = base[(base.lpr.isin([1, 2])) | ((base.lpr == 3) &
                                            (base.moins_de_15_ans_inclus)) |
                 ((base.lpr == 3) & (base.jeune_non_eligible_rsa) &
                  (~base.smic55))].copy()
    log.info('longueur de nof01 avant merge : {}'.format(len(nof01.index)))
    nof01 = nof01.merge(personne_de_reference, on='ident', how='outer')
    nof01['famille'] = 10
    nof01['kid'] = ((nof01.lpr == 3) & (nof01.moins_de_15_ans_inclus)) | (
        (nof01.lpr == 3) & (nof01.jeune_non_eligible_rsa) & ~(nof01.smic55))
    for series_name in ['famille', 'noifam']:
        assert_dtype(nof01[series_name], "int")
    assert_dtype(nof01.kid, "bool")
    famille = nof01.copy()
    del nof01
    control_04(famille, base)

    log.info(u"    2.1 : identification des couples")
    # l'ID est le noi de l'homme

    hcouple = subset_base(base, famille)
    hcouple = hcouple[(hcouple.cohab == 1) & (hcouple.lpr >= 3) &
                      (hcouple.sexe == 1)].copy()
    hcouple['noifam'] = (100 * hcouple.ident + hcouple.noi).astype(int)
    hcouple['famille'] = 21
    for series_name in ['famille', 'noifam']:
        assert_dtype(hcouple[series_name], "int")
    log.info(u"longueur hcouple : ".format(len(hcouple.index)))

    log.info(u"    2.2 : attributing the noifam to the wives")
    fcouple = base[~(base.noindiv.isin(famille.noindiv.values))].copy()
    fcouple = fcouple[(fcouple.cohab == 1) & (fcouple.lpr >= 3) &
                      (fcouple.sexe == 2)].copy()
    # l'identifiant de la famille est celui du conjoint de la personne de référence du ménage
    fcouple['noifam'] = (100 * fcouple.ident + fcouple.noicon).astype(int)
    fcouple['famille'] = 22
    for series_name in ['famille', 'noifam']:
        assert_dtype(fcouple[series_name], "int")
    log.info(u"Il y a {} enfants avec parents en fcouple".format(
        len(fcouple.index)))

    famcom = fcouple.merge(hcouple, on='noifam', how='outer')
    log.info(u"longueur fancom après fusion : {}".format(len(famcom.index)))
    fcouple = fcouple.merge(
        famcom
    )  # TODO : check s'il ne faut pas faire un inner merge sinon présence de doublons
    log.info(u"longueur fcouple après fusion : {}".format(len(fcouple.index)))
    famille = concat([famille, hcouple, fcouple], join='inner')
    control_04(famille, base)

    log.info(u"Etape 3: Récupération des personnes seules")
    log.info(u"    3.1 : personnes seules de catégorie 1")
    seul1 = base[~(base.noindiv.isin(famille.noindiv.values))].copy()
    seul1 = seul1[(seul1.lpr.isin([3, 4]))
                  & ((seul1.jeune_non_eligible_rsa & seul1.smic55)
                     | seul1.jeune_eligible_rsa) & (seul1.cohab == 1) &
                  (seul1.sexe == 2)].copy()
    if len(seul1.index) > 0:
        seul1['noifam'] = (100 * seul1.ident + seul1.noi).astype(int)
        seul1['famille'] = 31
        for series_name in ['famille', 'noifam']:
            assert_dtype(seul1[series_name], "int")
        famille = concat([famille, seul1])
    control_04(famille, base)

    log.info(u"    3.1 personnes seules de catégorie 2")
    seul2 = base[~(base.noindiv.isin(famille.noindiv.values))].copy()
    seul2 = seul2[(seul2.lpr.isin([3, 4])) & seul2.jeune_non_eligible_rsa
                  & seul2.smic55 & (seul2.cohab != 1)].copy()
    seul2['noifam'] = (100 * seul2.ident + seul2.noi).astype(int)
    seul2['famille'] = 32
    for series_name in ['famille', 'noifam']:
        assert_dtype(seul2[series_name], "int")
    famille = concat([famille, seul2])
    control_04(famille, base)

    log.info(u"    3.3 personnes seules de catégorie 3")
    seul3 = subset_base(base, famille)
    seul3 = seul3[(seul3.lpr.isin([3, 4])) & seul3.jeune_eligible_rsa &
                  (seul3.cohab != 1)].copy()
    # TODO: CHECK erreur dans le guide méthodologique ERF 2002 lpr 3,4 au lieu de 3 seulement
    seul3['noifam'] = (100 * seul3.ident + seul3.noi).astype(int)
    seul3['famille'] = 33
    for series_name in ['famille', 'noifam']:
        assert_dtype(seul3[series_name], "int")
    famille = concat([famille, seul3])
    control_04(famille, base)

    log.info(u"    3.4 : personnes seules de catégorie 4")
    seul4 = subset_base(base, famille)
    assert seul4.noimer.notnull().all()
    seul4 = seul4[(seul4.lpr == 4) & seul4.jeune_non_eligible_rsa
                  & ~(seul4.smic55) & (seul4.noimer == 0) &
                  (seul4.persfip == 'vous')].copy()
    if len(seul4.index) > 0:
        seul4['noifam'] = (100 * seul4.ident + seul4.noi).astype(int)
        seul4['famille'] = 34
        famille = concat([famille, seul4])
        for series_name in ['famille', 'noifam']:
            assert_dtype(seul4[series_name], "int")
    control_04(famille, base)

    log.info(u"Etape 4 : traitement des enfants")
    log.info(u"    4.1 : enfant avec mère")
    avec_mere = subset_base(base, famille)
    avec_mere = avec_mere[((avec_mere.lpr == 4) &
                           ((avec_mere.jeune_non_eligible_rsa == 1) |
                            (avec_mere.moins_de_15_ans_inclus == 1)) &
                           (avec_mere.noimer > 0))].copy()
    avec_mere['noifam'] = (100 * avec_mere.ident +
                           avec_mere.noimer).astype(int)
    avec_mere['famille'] = 41
    avec_mere['kid'] = True
    for series_name in ['famille', 'noifam']:
        assert_dtype(avec_mere[series_name], "int")
    assert_dtype(avec_mere.kid, "bool")

    # On récupère les mères des enfants
    mereid = DataFrame(avec_mere['noifam'].copy(
    ))  # Keep a DataFrame instead of a Series to deal with rename and merge
    # Ces mères peuvent avoir plusieurs enfants, or il faut unicité de l'identifiant
    mereid.rename(columns={'noifam': 'noindiv'}, inplace=True)
    mereid.drop_duplicates(inplace=True)
    mere = mereid.merge(base)
    mere['noifam'] = (100 * mere.ident + mere.noi).astype(int)
    mere['famille'] = 42
    for series_name in ['famille', 'noifam']:
        assert_dtype(mere[series_name], "int")
    avec_mere = avec_mere[avec_mere.noifam.isin(mereid.noindiv.values)].copy()
    log.info(u"Contrôle de famille après ajout des pères")
    control_04(mere, base)

    famille = famille[~(famille.noindiv.isin(mere.noindiv.values))].copy()
    control_04(famille, base)
    # on retrouve les conjoints des mères
    assert mere.noicon.notnull().all()
    conj_mereid = mere.loc[mere.noicon > 0,
                           ['ident', 'noicon', 'noifam']].copy()
    conj_mereid['noindiv'] = (100 * conj_mereid.ident + conj_mereid.noicon).astype(int)
    assert_dtype(conj_mereid['noindiv'], "int")
    conj_mereid = conj_mereid[['noindiv', 'noifam']].copy()
    conj_mere = conj_mereid.merge(base)
    control_04(conj_mere, base)
    conj_mere['famille'] = 43
    for series_name in ['famille', 'noifam']:
        assert_dtype(conj_mere[series_name], "int")
    famille = famille[~(famille.noindiv.isin(conj_mere.noindiv.values))].copy()

    famille = concat([famille, avec_mere, mere, conj_mere])
    control_04(famille, base)
    del avec_mere, mere, conj_mere, mereid, conj_mereid

    log.info(u"    4.2 : enfants avec père")
    avec_pere = subset_base(base, famille)
    assert avec_pere.noiper.notnull().all()
    avec_pere = avec_pere[(avec_pere.lpr == 4)
                          & ((avec_pere.jeune_non_eligible_rsa == 1)
                             | (avec_pere.moins_de_15_ans_inclus == 1)) &
                          (avec_pere.noiper > 0)]
    avec_pere['noifam'] = (100 * avec_pere.ident +
                           avec_pere.noiper).astype(int)
    avec_pere['famille'] = 44
    avec_pere['kid'] = True
    # TODO: hack to deal with the problem of presence of NaN in avec_pere
    #    avec_pere.dropna(subset = ['noifam'], how = 'all', inplace = True)
    assert avec_pere['noifam'].notnull().all(), 'presence of NaN in avec_pere'
    for series_name in ['famille', 'noifam']:
        assert_dtype(avec_pere[series_name], "int")
    assert_dtype(avec_pere.kid, "bool")

    pereid = DataFrame(
        avec_pere['noifam']
    )  # Keep a DataFrame instead of a Series to deal with rename and merge
    pereid.rename(columns={'noifam': 'noindiv'}, inplace=True)
    pereid.drop_duplicates(inplace=True)
    pere = pereid.merge(base)

    pere['noifam'] = (100 * pere.ident + pere.noi).astype(int)
    pere['famille'] = 45
    famille = famille[~(famille.noindiv.isin(pere.noindiv.values))].copy()

    # On récupère les conjoints des pères
    assert pere.noicon.notnull().all()
    conj_pereid = pere.loc[pere.noicon > 0,
                           ['ident', 'noicon', 'noifam']].copy()
    conj_pereid['noindiv'] = (100 * conj_pereid.ident +
                              conj_pereid.noicon).astype(int)
    conj_pereid = conj_pereid[['noindiv', 'noifam']].copy()

    conj_pere = conj_pereid.merge(base)
    control_04(conj_pere, base)
    if len(conj_pere.index) > 0:
        conj_pere['famille'] = 46
        for series_name in ['famille', 'noifam']:
            assert_dtype(conj_pere[series_name], "int")

    famille = famille[~(famille.noindiv.isin(conj_pere.noindiv.values))].copy()
    famille = concat([famille, avec_pere, pere, conj_pere])
    log.info(u"Contrôle de famille après ajout des pères")
    control_04(famille, base)
    del avec_pere, pere, pereid, conj_pere, conj_pereid

    log.info(u"    4.3 : enfants avec déclarant")
    avec_dec = subset_base(base, famille)
    avec_dec = avec_dec[(avec_dec.persfip == "pac") & (avec_dec.lpr == 4) & (
        (avec_dec.jeune_non_eligible_rsa & ~(avec_dec.smic55)) |
        (avec_dec.moins_de_15_ans_inclus == 1))]
    avec_dec['noifam'] = (100 * avec_dec.ident +
                          avec_dec.noidec.astype('int')).astype('int')
    avec_dec['famille'] = 47
    avec_dec['kid'] = True
    for series_name in ['famille', 'noifam']:
        assert_dtype(avec_dec[series_name], "int")
    assert_dtype(avec_dec.kid, "bool")
    control_04(avec_dec, base)
    # on récupère les déclarants pour leur attribuer une famille propre
    declarant_id = DataFrame(
        avec_dec['noifam'].copy()).rename(columns={'noifam': 'noindiv'})
    declarant_id.drop_duplicates(inplace=True)
    dec = declarant_id.merge(base)
    dec['noifam'] = (100 * dec.ident + dec.noi).astype(int)
    dec['famille'] = 48
    for series_name in ['famille', 'noifam']:
        assert_dtype(dec[series_name], "int")
    famille = famille[~(famille.noindiv.isin(dec.noindiv.values))].copy()
    famille = concat([famille, avec_dec, dec])
    del dec, declarant_id, avec_dec
    control_04(famille, base)

    log.info(u"Etape 5 : Récupération des enfants fip")
    log.info(u"    5.1 : Création de la df fip")
    individual_variables_fip = [
        'acteu',
        'actrec',
        'agepf',
        'agepr',
        'cohab',
        'contra',
        'declar1',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noidec',
        'noimer',
        'noindiv',
        'noiper',
        'persfip',
        'quelfic',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        'year',
        'ztsai',
    ]
    fip = temporary_store['fipDat_{}'.format(
        year)][individual_variables_fip].copy()
    # Variables auxiliaires présentes dans base qu'il faut rajouter aux fip
    # WARNING les noindiv des fip sont construits sur les ident des déclarants
    # pas d'overlap possible avec les autres noindiv car on a des noi = 99, 98, 97, ...
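    # e.g. a fip attached to the declarant of ident 1234 with noi 99 gets noindiv = 100 * 1234 + 99 = 123499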
    fip['moins_de_15_ans_inclus'] = (fip.agepf < 16)
    fip['jeune_non_eligible_rsa'] = ((fip.agepf >= 16) & (fip.agepf <= 20))
    fip['jeune_eligible_rsa'] = (fip.agepf >= 21)
    fip['smic55'] = (fip.ztsai >= smic * 12 * 0.55)
    fip['famille'] = 0
    fip['kid'] = False
    for series_name in [
            'kid', 'moins_de_15_ans_inclus', 'jeune_non_eligible_rsa',
            'jeune_eligible_rsa', 'smic55'
    ]:
        assert_dtype(fip[series_name], "bool")
    for series_name in ['famille']:
        assert_dtype(fip[series_name], "int")


# # base <- rbind(base,fip)
# # table(base$quelfic)

# # enfant_fip <- base[(!base$noindiv %in% famille$noindiv),]
# # enfant_fip <- subset(enfant_fip, (quelfic=="FIP") & (( (agepf %in% c(19,20)) & !smic55 ) | (naia==year & rga=='6')) )  # TODO check year ou year-1 !
# # enfant_fip <- within(enfant_fip,{
# #                      noifam=100*ident+noidec
# #                      famille=50
# #                      kid=TRUE})
# # #                     ident=NA}) # TODO : je ne sais pas quoi mettre un NA fausse les manips suivantes
# # famille <- rbind(famille,enfant_fip)
# #
# # # TODO: En 2006 on peut faire ce qui suit car tous les parents fip sont déjà dans une famille
# # parent_fip <- famille[famille$noindiv %in% enfant_fip$noifam,]
# # any(enfant_fip$noifam %in% parent_fip$noindiv)
# # parent_fip <- within(parent_fip,{
# #                      noifam <- noindiv
# #                      famille <- 51
# #                      kid <- FALSE})
# # famille[famille$noindiv %in% enfant_fip$noifam,] <- parent_fip
# # # TODO quid du conjoint ?

    log.info(u"    5.2 : extension de base avec les fip")
    base_ = concat([base, fip])
    enfant_fip = subset_base(base_, famille)
    enfant_fip = enfant_fip[(enfant_fip.quelfic == "FIP") & (
        (enfant_fip.agepf.isin([19, 20]) & ~(enfant_fip.smic55)) | (
            (enfant_fip.naia == enfant_fip.year - 1) &
            (enfant_fip.rga.astype('int') == 6)))].copy()
    enfant_fip['noifam'] = (100 * enfant_fip.ident +
                            enfant_fip.noidec).astype(int)
    enfant_fip['famille'] = 50
    enfant_fip['kid'] = True
    enfant_fip['ident'] = None  # TODO: should we really do this ?
    assert_dtype(enfant_fip.kid, "bool")
    for series_name in ['famille', 'noifam']:
        assert_dtype(enfant_fip[series_name], "int")
    control_04(enfant_fip, base)
    famille = concat([famille, enfant_fip])
    base = concat([base, enfant_fip])
    parent_fip = famille[famille.noindiv.isin(enfant_fip.noifam.values)].copy()
    assert (enfant_fip.noifam.isin(parent_fip.noindiv.values)).any(), \
        "aucun parent fip trouvé dans famille pour les enfants fip"
    parent_fip['noifam'] = parent_fip['noindiv'].values.copy()
    parent_fip['famille'] = 51
    parent_fip['kid'] = False
    log.info(u"Contrôle de parent_fip")
    control_04(parent_fip, base)
    control_04(famille, base)
    famille = famille.merge(parent_fip, how='outer')
    del enfant_fip, fip, parent_fip
    gc.collect()
    # duplicated_individuals = famille.noindiv.duplicated()
    # TODO: How to prevent the next assert from failing without dropping duplicates?
    # assert not duplicated_individuals.any(), "{} duplicated individuals in famille".format(
    # duplicated_individuals.sum())
    famille = famille.drop_duplicates(subset='noindiv', keep='last')
    control_04(famille, base)

    # # message('Etape 6 : non attribué')
    # # non_attribue1 <- base[(!base$noindiv %in% famille$noindiv),]
    # # non_attribue1 <- subset(non_attribue1,
    # #                         (quelfic!="FIP") & (moins_de_15_ans_inclus | (jeune_non_eligible_rsa&(lien %in% c(1,2,3,4) & agepr>=35)))
    # #                         )
    # # # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice)
    # # non_attribue1 <- merge(pr,non_attribue1)
    # # non_attribue1 <- within(non_attribue1,{
    # #   famille <- ifelse(moins_de_15_ans_inclus,61,62)
    # #     kid <- TRUE })
    # #
    # # rm(pr)
    # # famille <- rbind(famille,non_attribue1)
    # # dup <- duplicated(famille$noindiv)
    # # table(dup)
    # # rm(non_attribue1)
    # # table(famille$famille, useNA="ifany")
    # #
    # # non_attribue2 <- base[(!base$noindiv %in% famille$noindiv) & (base$quelfic!="FIP"),]
    # # non_attribue2 <- within(non_attribue2,{
    # #   noifam <- 100*ident+noi # l'identifiant est celui du jeune */
    # #     kid<-FALSE
    # #     famille<-63})
    # #
    # # famille <- rbind(famille,non_attribue2)

    log.info(u"Etape 6 : gestion des non attribués")
    log.info(u"    6.1 : non attribués type 1")
    non_attribue1 = subset_base(base, famille)
    non_attribue1 = non_attribue1[(non_attribue1.quelfic != 'FIP')
                                  & (non_attribue1.moins_de_15_ans_inclus |
                                     (non_attribue1.jeune_non_eligible_rsa &
                                      (non_attribue1.lien.isin(range(1, 5))) &
                                      (non_attribue1.agepr >= 35)))].copy()
    # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice)
    non_attribue1 = non_attribue1.merge(personne_de_reference)
    control_04(non_attribue1, base)
    non_attribue1[
        'famille'] = 61 * non_attribue1.moins_de_15_ans_inclus + 62 * ~(
            non_attribue1.moins_de_15_ans_inclus)
    non_attribue1['kid'] = True
    assert_dtype(non_attribue1.kid, "bool")
    assert_dtype(non_attribue1.famille, "int")
    famille = concat([famille, non_attribue1])
    control_04(famille, base)
    del personne_de_reference, non_attribue1

    log.info(u"    6.2 : non attribué type 2")
    non_attribue2 = base[(~(base.noindiv.isin(famille.noindiv.values)) &
                          (base.quelfic != "FIP"))].copy()
    non_attribue2['noifam'] = (100 * non_attribue2.ident +
                               non_attribue2.noi).astype(int)
    non_attribue2['kid'] = False
    non_attribue2['famille'] = 63
    assert_dtype(non_attribue2.kid, "bool")
    for series_name in ['famille', 'noifam']:
        assert_dtype(non_attribue2[series_name], "int")
    famille = concat([famille, non_attribue2], join='inner')
    control_04(famille, base)
    del non_attribue2

    # Sauvegarde de la table famille
    log.info(u"Etape 7 : Sauvegarde de la table famille")
    log.info(u"    7.1 : Mise en forme finale")
    #    TODO: nettoyer les champs qui ne servent plus à rien
    #    famille['idec'] = famille['declar1'].str[3:11]
    #    famille['idec'].apply(lambda x: str(x)+'-')
    #    famille['idec'] += famille['declar1'].str[0:2]
    famille['chef'] = (famille['noifam'] == (100 * famille.ident +
                                             famille.noi))
    assert_dtype(famille.chef, "bool")

    famille.reset_index(inplace=True)
    control_04(famille, base)

    log.info(u"    7.2 : création de la colonne rang")

    famille['rang'] = famille.kid.astype('int')
    while any(famille[famille.rang != 0].duplicated(subset=['rang', 'noifam'])):
        famille.loc[famille.rang != 0, 'rang'] += famille[famille.rang != 0].duplicated(
            subset=['rang', 'noifam']).values
        log.info(u"nb de rangs différents : {}".format(
            len(set(famille.rang.values))))
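    # A groupby-based alternative for this rang computation (untested sketch, assuming
    # kids may be numbered in any stable order within a family):
    # famille.loc[famille.kid, 'rang'] = (
    #     famille[famille.kid].groupby('noifam').cumcount() + 1)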

    log.info(u"    7.3 : création de la colonne quifam et troncature")
    log.info(u"value_counts chef : \n {}".format(
        famille['chef'].value_counts()))
    log.info(u"value_counts kid :' \n {}".format(
        famille['kid'].value_counts()))

    famille['quifam'] = -1
    # famille['quifam'] = famille['quifam'].where(famille['chef'].values, 0)
    # ATTENTION : ^ stands for XOR
    famille.quifam = (0 + ((~famille['chef']) &
                           (~famille['kid'])).astype(int) +
                      famille.kid * famille.rang).astype('int')

    # TODO: Test a groupby to improve the following check (see the sketch below)
    #    assert famille['chef'].sum() == len(famille.noifam.unique()), \
    #      'The number of family chiefs {} is different from the number of families {}'.format(
    #          famille['chef'].sum(),
    #          len(famille.idfam.unique())
    #          )
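    #    A possible groupby-based version of that check (untested sketch):
    #    chef_count_by_family = famille.groupby('noifam')['chef'].sum()
    #    assert (chef_count_by_family == 1).all(), \
    #        'some families have zero or several chefs'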

    #    famille['noifam'] = famille['noifam'].astype('int')
    log.info(u"value_counts quifam : \n {}".format(
        famille['quifam'].value_counts()))
    famille = famille[['noindiv', 'quifam', 'noifam']].copy()
    famille.rename(columns={'noifam': 'idfam'}, inplace=True)
    log.info(u"Vérifications sur famille")
    # TODO: we drop duplicates if any

    duplicated_famille_count = famille.duplicated(
        subset=['idfam', 'quifam']).sum()
    if duplicated_famille_count > 0:
        log.info(
            u"There are {} duplicates of quifam inside famille, we drop them".format(
                duplicated_famille_count))
        famille.drop_duplicates(subset=['idfam', 'quifam'], inplace=True)
        # assert not(famille.duplicated(subset = ['idfam', 'quifam']).any()), \
        #   'There are {} duplicates of quifam inside famille'.format(
        #       famille.duplicated(subset = ['idfam', 'quifam']).sum())

    temporary_store["famc_{}".format(year)] = famille
    del indivi, enfants_a_naitre
Exemple #59
0
    def label_processing(self,
                         user_df: pd.DataFrame,
                         item_df: pd.DataFrame,
                         user_his_df: pd.DataFrame,
                         label_df: pd.DataFrame,
                         u_id_col: str,
                         i_id_col: str,
                         y_label: str,
                         start_index=0,
                         end_index=-1,
                         fill_na=0,
                         **kwargs):
        """
        对原始数据进行预处理,输入为pandas dataframe,输出为直接入模的numpy array
        Args:
            user_df: 静态数据的Dataframe
            item_df: 时序数据的Dataframe
            user_his_df: 时序数据的Dataframe
            label_df: Y标签的Dataframe training为True时起作用
            u_id_col: user_id的列名
            i_id_col: item_id的列名
            start_index: 本次批量的开始位置
            end_index: 本次批量的结束位置
            y_label: Y标签列名  training为True时起作用
            fill_na: 缺失值填充
        """
        if type(user_df) != pd.DataFrame or type(item_df) != pd.DataFrame or type(user_his_df) != pd.DataFrame\
                or type(label_df) != pd.DataFrame:
            raise ValueError(
                "Error: Input X data must be Pandas.DataFrame format.\n输入数据必须是Pandas DataFrame格式!"
            )
        # 填充缺失值
        user_df = user_df.fillna(fill_na)
        item_df = item_df.fillna(fill_na)
        user_his_df = user_his_df.fillna(fill_na)

        # 先对离散字段做编码
        for col in self.i_discrete_cols:
            le = self.le_dict[col]
            tmp = item_df[col].map(lambda s: fill_na
                                   if s not in le.classes_ else s)
            item_df[col + str('_transformed')] = le.transform(tmp)

        if end_index == -1:
            end_index = len(user_df)
        u_ids = user_df.iloc[start_index:end_index][[u_id_col]]
        # 取出本次batch的数据
        user_batch_df = user_df[user_df[u_id_col].isin(u_ids[u_id_col])]
        # 笛卡尔积之前先做编码
        for col in self.u_discrete_cols:
            le = self.le_dict[col]
            tmp = user_batch_df[col].map(lambda s: fill_na
                                         if s not in le.classes_ else s)
            user_batch_df[col + str('_transformed')] = le.transform(tmp)

        user_batch_his_df = user_his_df[user_his_df[u_id_col].isin(
            u_ids[u_id_col])]
        # 先对离散字段做编码
        for col in self.u_history_cols:
            item_col_name = self.u_history_col_names[self.u_history_cols.index(
                col)]
            le = self.le_dict[item_col_name]
            tmp = user_batch_his_df[col].map(lambda s: fill_na
                                             if s not in le.classes_ else s)
            user_batch_his_df[col + str('_transformed')] = le.transform(tmp)

        # 获取用户侧输入特征
        label_df = label_df.drop_duplicates([u_id_col, i_id_col],
                                            keep='first')  # 防止出现脏数据
        print("label_df长度:" + str(len(label_df)))
        print("用户长度:" + str(len(user_batch_df)))
        basic_df = pd.merge(user_batch_df,
                            label_df,
                            how='inner',
                            on=[u_id_col])
        print("和用户拼接后长度:" + str(len(basic_df)))
        basic_df = pd.merge(basic_df, item_df, how='inner', on=[i_id_col])
        print("和图片拼接后长度:" + str(len(basic_df)))
        basic_df['id'] = basic_df[u_id_col] + "@" + basic_df[i_id_col]
        user_batch_his_df = pd.merge(basic_df[['id', u_id_col]],
                                     user_batch_his_df,
                                     how='left',
                                     on=[u_id_col])
        user_batch_his_df = user_batch_his_df.fillna(fill_na)  # 对没有任何历史记录的用户填充
        print("数据预处理完成")

        # 拼接输入特征
        X_input = [
            self.scalar_dict['scalar_user'].transform(
                basic_df[self.u_continue_cols])
        ]
        for col in self.u_discrete_cols:
            X_input.append(basic_df[col + str('_transformed')])
        print("用户基础特征处理完成")

        for col in self.u_history_cols:
            item_col_name = self.u_history_col_names[self.u_history_cols.index(col)]
            le = self.le_dict[item_col_name]  # use the encoder of this history column
            constant_fill_na = dict(zip(le.classes_, le.transform(
                le.classes_)))[fill_na]  # 取出fill_na在编码后的值
            user_batch_col_tmp = user_batch_his_df[['id', col+str('_transformed')]].groupby(['id'])\
                .apply(lambda x: np.pad(x[col+str('_transformed')].values, (self.u_history_col_ts_step[col]-len(x[col+str('_transformed')].values), 0),'constant',
                constant_values=constant_fill_na) if len(x[col+str('_transformed')].values) < self.u_history_col_ts_step[col] else x[col+str('_transformed')].values)
            user_batch_col_tmp = np.stack(user_batch_col_tmp.values)
            X_input.append(user_batch_col_tmp)
        print("用户历史记录特征处理完成")

        # 拼接物品端输入特征
        X_input.append(self.scalar_dict['scalar_item'].transform(
            basic_df[self.i_continue_cols]))
        for col in self.i_discrete_cols:
            X_input.append(basic_df[col + str('_transformed')])
        print("物品特征处理完成")
        return X_input, basic_df[y_label].values, basic_df['id'].values
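    # Hypothetical usage sketch (not part of the original snippet): assuming `processor`
    # is an instance of the enclosing class, with `le_dict` and `scalar_dict` already
    # fitted, and with made-up column names:
    # X_input, y, ids = processor.label_processing(
    #     user_df=user_df, item_df=item_df, user_his_df=user_his_df, label_df=label_df,
    #     u_id_col='user_id', i_id_col='item_id', y_label='label',
    #     start_index=0, end_index=1024, fill_na=0)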
Exemple #60
0
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
data = {
    'chinese': [66, 95, 95, 90, 80, 80],
    'english': [65, 85, 92, 88, 90, 90],
    'math': [np.nan, 98, 96, 77, 90, 90]
}
df2 = DataFrame(data,
                index=[
                    'zhangfei', 'guanyu', 'zhaoyun', 'huangzhong', 'dianwei',
                    'dianwei'
                ],
                columns=['chinese', 'english', 'math'])
df2.drop_duplicates(inplace=True)
print(df2)
df2.rename(columns={
    'chinese': '语文',
    'english': '英语',
    'math': '数学'
},
           index={
               'zhangfei': '张飞',
               'guanyu': '关羽',
               'zhaoyun': '赵云',
               'huangzhong': '黄忠',
               'dianwei': '典韦'
           },
           inplace=True)
print(df2.isnull())
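# A possible follow-up (sketch): isnull() above only reports the remaining missing
# math score; one way to handle it is to fill it with the column mean.
df2['数学'] = df2['数学'].fillna(df2['数学'].mean())
print(df2)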