Example #1
def removeDuplicate(file):
    """Removes duplicate points based on X, Y coordinates

       Returns a numpy array"""
    df = DataFrame(np.vstack((file.x, file.y, file.z)).transpose(), columns=['X', 'Y', 'Z'])
    df.drop_duplicates(subset=['X','Y'], inplace=True)
    return df.values
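A minimal usage sketch of the function above. It assumes the snippet's implied imports (import numpy as np and from pandas import DataFrame); SimpleNamespace is only a hypothetical stand-in for the real point-cloud object with x, y, z arrays.

import numpy as np
from types import SimpleNamespace

# Two points share the same X, Y but differ in Z; only the first is kept.
cloud = SimpleNamespace(
    x=np.array([0.0, 0.0, 1.0]),
    y=np.array([0.0, 0.0, 2.0]),
    z=np.array([5.0, 9.0, 3.0]),
)
points = removeDuplicate(cloud)   # the function defined above
print(points)
# [[0. 0. 5.]
#  [1. 2. 3.]]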
Example #2
def test_drop_duplicates_tuple():
    df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
                                   'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates(('AA', 'AB'))
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AA', 'AB'), keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AA', 'AB'), keep=False)
    expected = df.loc[[]]  # empty df
    assert len(result) == 0
    tm.assert_frame_equal(result, expected)

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates((('AA', 'AB'), 'B'))
    tm.assert_frame_equal(result, expected)
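A small sketch of the disambiguation this test exercises: a subset tuple that is itself a column label selects just that one column, while a tuple of labels (the wrapped form used in the multi-column case) is treated as a sequence of columns.

from pandas import DataFrame

df = DataFrame({('AA', 'AB'): ['x', 'x', 'y'], 'B': [1, 1, 2]})

# The tuple matches an existing column label, so only that column is used.
df.drop_duplicates(('AA', 'AB'))
# A tuple that is not itself a column label is read as a sequence of labels,
# which is why the multi-column case above wraps the column tuple again.
df.drop_duplicates((('AA', 'AB'), 'B'))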
Example #3
def get_link(url):
	link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>')
	links = []
	
	# open web content
	f = urllib2.urlopen(url)
	content = f.read()
	
	# BeautifulSoup version: find all URLs and save them to links
	# soup = BeautifulSoup(content, "lxml")
	# for a in soup.find_all('a', href=True):
	# 	if "detik.com" in a['href']:
	# 		if "http:" not in a['href']:
	# 			a['href'] = "http:" + a['href']
	# 		print "Found the URL:", a['href']
	# 		links.append(a['href'])
			
	# regex version: find all URLs and save them to links
	for link in link_exr.findall(content):
		if "detik.com" in link[0]:
			link_detik = link[0]
			if "http:" not in link_detik:
				link_detik = "http:" + link_detik
			links.append(link_detik)
	
	# save to DataFrame
	df = DataFrame(links, columns=['detik url'])
	df = df.drop_duplicates()  # drop_duplicates returns a new frame; assign it back

	print df.head(0)
	# create an engine and save to a MySQL database
	detik_db = create_engine("mysql://root:root@localhost/data_detik") 
	df.to_sql('url_detik', detik_db, if_exists='replace')
Example #4
def process_duplicated_entries(dfm_stk_strc: DataFrame, stockid):
    # Merge rows sharing the same 变动日期 (change date): keep the first row and
    # append the duplicates' 变动原因 (change reason) text to it.
    dfm_duplicated = dfm_stk_strc[dfm_stk_strc.duplicated(['变动日期'])]
    # print(dfm_duplicated)
    dfm_stk_strc.drop_duplicates('变动日期',inplace=True)
    for index, row in dfm_duplicated.iterrows():
        # dfm_stk_strc.loc[index]['变动原因'] = dfm_stk_strc.loc[index]['变动原因'] +'|'+row['变动原因']
        dfm_stk_strc.loc[index,'变动原因'] = dfm_stk_strc.loc[index]['变动原因'] + '|' + row['变动原因']
        logprint('Stock %s 变动日期 %s 记录合并到主记录中. %s' %(stockid,row['变动日期'],tuple(row)))
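The loop above folds the free-text 变动原因 (change reason) of duplicated 变动日期 (change date) rows into the surviving row. A minimal sketch of the same intent on hypothetical toy data, expressed as a groupby aggregate instead of the drop-then-merge loop:

from pandas import DataFrame

df = DataFrame({'change_date': ['2020-01-01', '2020-01-01', '2020-03-01'],
                'reason': ['split', 'bonus', 'ipo']})

# One row per date; the reasons of merged duplicates are joined by '|'.
merged = (df.groupby('change_date', as_index=False, sort=False)
            .agg({'reason': '|'.join}))
print(merged)
#   change_date       reason
# 0  2020-01-01  split|bonus
# 1  2020-03-01          ipo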
Example #5
def test_duplicated_with_misspelled_column_name(subset):
    # GH 19730
    df = DataFrame({'A': [0, 0, 1],
                    'B': [0, 0, 1],
                    'C': [0, 0, 1]})

    with pytest.raises(KeyError):
        df.duplicated(subset)

    with pytest.raises(KeyError):
        df.drop_duplicates(subset)
Example #6
def test_drop_duplicates_with_duplicate_column_names():
    # GH17836
    df = DataFrame([
        [1, 2, 5],
        [3, 4, 6],
        [3, 4, 7]
    ], columns=['a', 'a', 'b'])

    result0 = df.drop_duplicates()
    tm.assert_frame_equal(result0, df)

    result1 = df.drop_duplicates('a')
    expected1 = df[:2]
    tm.assert_frame_equal(result1, expected1)
Example #7
    def _decode_solutions(self, solutions):
        decoded_solutions = DataFrame(columns=["targets", "fitness"])
        index = 0
        for solution in solutions:
            combinations = self._decoder(solution.candidate, flat=True, decompose=True)
            for targets in combinations:
                if len(targets) > 0:
                    decoded_solutions.loc[index] = [tuple(targets), solution.fitness]
                    index += 1

        decoded_solutions.drop_duplicates(inplace=True, subset="targets")
        decoded_solutions.reset_index(inplace=True)

        return decoded_solutions
Example #8
    def get_dominant_alleles( sample_ids, marker_ids = None ):

        t1 = aliased( Allele )
        t2 = aliased( Allele )
        s1 = aliased( AlleleSet )
        marker_ids = [ int(x) for x in marker_ids ]

        # Outer self-join against taller alleles of the same marker/alleleset and
        # keep rows with no taller match (an anti-join): i.e. the dominant
        # (highest) allele per marker and sample.
        q = dbsession.query( t1.marker_id, s1.sample_id, t1.value, t1.size, t1.height ).\
            join( s1 ).\
            outerjoin( t2, and_( t1.marker_id == t2.marker_id,
                                t1.alleleset_id == t2.alleleset_id,
                                t1.height < t2.height) ).\
            filter( t2.marker_id == None ).order_by( t1.marker_id, s1.sample_id ).\
            filter( s1.sample_id.in_( sample_ids ) ).filter( t1.marker_id.in_( marker_ids ))

        df = DataFrame( [ (marker_id, sample_id, value, size, height)
                        for marker_id, sample_id, value, size, height in q ] )

        if len(df) == 0:
            return None

        df.columns = ( 'marker_id', 'sample_id', 'value', 'size', 'height' )
        return df.drop_duplicates( ['marker_id', 'sample_id'] )
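The outer self-join above is the SQL greatest-per-group pattern: keep the tallest allele per (marker, sample), with the final drop_duplicates breaking exact-height ties. A minimal pandas sketch of the same idea on hypothetical data:

from pandas import DataFrame

alleles = DataFrame({'marker_id': [1, 1, 2],
                     'sample_id': [7, 7, 7],
                     'value':     [180, 184, 200],
                     'height':    [350, 900, 420]})

# Sort tallest-first, then keep the first row per (marker_id, sample_id).
dominant = (alleles.sort_values('height', ascending=False)
                   .drop_duplicates(['marker_id', 'sample_id'])
                   .sort_values(['marker_id', 'sample_id']))
print(dominant)
#    marker_id  sample_id  value  height
# 1          1          7    184     900
# 2          2          7    200     420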
Example #9
def recursive_add_consumers(consumer_id, seen = set([])):
        if consumer_id is None:
            return

        seen.add(consumer_id)
        consumer_key = sample[sample.Consumer == consumer_id]
        IP = consumer_key[['IP']].drop_duplicates()  # unique IPs seen for this consumer

        n = np.array(np.arange(len(IP)))

        IP_Map = []   # DataFrames are unhashable, so collect the matches in a list
        for i in n:
          value = sample[sample.IP.isin([IP.iloc[i,0]])]
          IP_Map.append(value)

        #print IP_Map

        print consumer_id
        print seen
        consumer_list = []

        #list of unique consumers that are linked to this one
        [consumer_list.extend(y.Consumer.iloc[l].tolist()) for l in [range(len(y.Consumer)) for y in IP_Map]]

        #print consumer_list
        #print [x for x in set(consumer_list).difference([consumer_id])]
        #unique_consumer_list = []
        #print [ x for x in set([y.Consumer.iloc[0] for y in IP_Map])]


        #tuples of ips and unique consumers attached to them
        print [(y.IP.iloc[0],set(y.Consumer.iloc[l].tolist())) for l in [range(len(y.Consumer)) for y in IP_Map]]
Example #10
    def getDateTimeSeries(self, instrument=None):
        if instrument is None:
            __dateTime = DataFrame()
            for element in self.__instrument:
                __dateTime = __dateTime.append(self.__feed[element].getPriceDataSeries().getDateTimes())
            __dateTime = __dateTime.drop_duplicates([0])
            return __dateTime.values  # returns a 2-D ndarray here
        return self.__feed[instrument].getPriceDataSeries().getDateTimes()
Example #11
def process_duplicated_entries(dfm_stk_info: DataFrame, stockid):
    # Merge rows sharing the same 股权登记日 (record date): keep the first row and
    # fold the duplicates' dividend fields into it.
    dfm_duplicated = dfm_stk_info[dfm_stk_info.duplicated(['股权登记日'])]
    # print(dfm_duplicated)
    dfm_stk_info.drop_duplicates('股权登记日',inplace=True)
    for index, row in dfm_duplicated.iterrows():
        dfm_stk_info.loc[index,'分红年度'] = add_considering_None(dfm_stk_info.loc[index]['分红年度'],row['分红年度'])
        dfm_stk_info.loc[index,'分红方案'] = dfm_stk_info.loc[index]['分红方案'] + '|' + row['分红方案']
        if dfm_stk_info.loc[index]['方案文本解析错误标识位'] !='E':
            if row['方案文本解析错误标识位'] == 'E':
                dfm_stk_info.loc[index, '方案文本解析错误标识位'] = 'E'
                dfm_stk_info.loc[index, '派息(税前)(元)/10股'] = None
                dfm_stk_info.loc[index, '转增(股)/10股'] = None
                dfm_stk_info.loc[index, '送股(股)/10股'] = None
            else:
                dfm_stk_info.loc[index,'派息(税前)(元)/10股'] = add_considering_None(dfm_stk_info.loc[index]['派息(税前)(元)/10股'],row['派息(税前)(元)/10股'])
                dfm_stk_info.loc[index,'转增(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['转增(股)/10股'] , row['转增(股)/10股'])
                dfm_stk_info.loc[index,'送股(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['送股(股)/10股'] , row['送股(股)/10股'])
        logprint('Stock %s 股权登记日 %s 记录合并到主记录中. %s' %(stockid,row['股权登记日'],tuple(row)))
Example #12
    def _compute_consistency(self):
        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df = df.drop_duplicates()
        tmp = df.groupby(['e_type', 'entity']).size().reset_index()
        tmp.rename(columns={0: 'consistency'}, inplace=True)

        return tmp
Example #13
    def compute_ambiguity(self):
        results = self.cursor.execute('SELECT type, entity FROM entities')
        tmp = results.fetchall()
        freq_df = DataFrame(tmp, columns=['e_type', 'entity'])
        freq_df['ambiguity'] = 10
        freq_df = freq_df.drop_duplicates()
        result_computed_location = self._compute_location_ambiguity(freq_df)
        result_computed_name = self._compute_name_ambiguity(result_computed_location)
        return result_computed_name
Example #14
def IP_Weight_Calc(consumer_id):

        if consumer_id is None:
            return
        consumer_key = sample[sample.Consumer == consumer_id]

        IP = consumer_key[['IP']].drop_duplicates()  # unique IPs seen for this consumer

        n = np.array(np.arange(len(IP)))
        IP_Weight_List = []

        for i in n:
            value = sample[sample.IP.isin([IP.iloc[i,0]])]
            value2 = len(value[['Consumer']].drop_duplicates())  # unique consumers on this IP
            value3 = 1.0 / (value2 ** 2)                         # avoid integer division under Python 2
            IP_Weight_List.append(value3)

        return sum(IP_Weight_List)
Example #15
def getSubwayDate(filepath2):
    allFiles = DataFrame(os.listdir(filepath2))
    allFiles.columns = ['FileNames']
    allFiles = allFiles[allFiles.FileNames.str.slice(0, 6) == 'gtfs-2']
    allFiles['datetime1'] = allFiles.FileNames.str.slice(5)
    allFiles.datetime1 = allFiles.datetime1.map(parser.parse)
    allFiles.datetime1 = pd.DatetimeIndex(allFiles.datetime1).tz_convert('America/New_York')
    times1 = []
    for dt in allFiles.datetime1:
        tm = dt.time()
        times1.append(tm)
    allFiles['time1'] = Series(times1)
    allFiles = allFiles[allFiles.time1 >= time(6)]
    allFiles = allFiles[allFiles.time1 <= time(9, 30)]
    
    df_allTrains = DataFrame(np.zeros(0, dtype = [('current_status', 'O')]))
    df_trips = DataFrame(np.zeros(0, dtype = [('route_id', 'O')]))
    #old_df = getTrains(allFiles.FileNames.iloc[0], test_df)
    for fileName0 in allFiles.FileNames:
        fileName1 = os.path.join(filepath2,fileName0)
        try:
            df_allTrains, df_trips = getTrains(fileName1, df_allTrains, df_trips)
        except Exception:
            print 'file ' + fileName0 + ' did not work'
            continue
        
    df_allTrains = df_allTrains.sort_values(['trip_id', 'timestamp', 'timestamp1'])
    unique_ids = ['start_date','route_id','trip_id', 'train_id', 'stop_id', 'stop_name', 'current_status', 'timestamp']
    df_allTrains.drop_duplicates(unique_ids, keep = 'last', inplace = True)
    unique_ids.remove('timestamp')
    df_grouped = df_allTrains.groupby(unique_ids)
    df_grouped1 = df_grouped['timestamp'].agg({'minTS1': min, 'maxTS1':max})
    df_grouped2 = df_grouped['timestamp1'].agg(max)
    df_grouped1['maxTS2'] = df_grouped2
    df_allTrains = df_grouped1.reset_index()
    df_allTrains.sort_values(['trip_id', 'minTS1'], inplace = True)
    
    df_trips = df_trips.sort_values(['trip_id', 'stop_id', 'timestamp1'])  
    
    shutil.rmtree(filepath2)
    return(df_allTrains, df_trips)
Example #16
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)

    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]

    print pd.value_counts(info.group)[:10]

    print "今から全部のnutrientsを扱うよ"
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "なんか重複多い"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "infoとnutrients両方にdescriptionとgroupがあるから変えよう"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print max_foods.ix['Amino Acids']['food']
Example #17
def bulk_download(year, bucket):
    bucket = expanduser(bucket)
    filepath = bucket + 'latvia' + str(year) + '.csv'

    data = DataFrame()

    for scheme in SCHEMES:
        fragment = Fragment(scheme, year, bucket)

        if not fragment.is_cached:
            fragment.download()

        fragment.load_from_csv()
        fragment.cleanup()

        data = concat([data, fragment.data], ignore_index=True)
        log.debug('Added %s rows to bulk dataframe', len(fragment.data))

    data.drop_duplicates(inplace=True)
    data.to_csv(filepath, encoding='utf-8', mode='w+')
    log.info('Bulk download saved to %s (%s rows)', filepath, len(data))
Example #18
def test_drop_duplicates_NA_for_take_all():
    # none
    df = DataFrame({'A': [None, None, 'foo', 'bar',
                          'foo', 'baz', 'bar', 'qux'],
                    'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})

    # single column
    result = df.drop_duplicates('A')
    expected = df.iloc[[0, 2, 3, 5, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('A', keep='last')
    expected = df.iloc[[1, 4, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('A', keep=False)
    expected = df.iloc[[5, 7]]
    tm.assert_frame_equal(result, expected)

    # nan

    # single column
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 1, 5, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[3, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('C', keep=False)
    expected = df.iloc[[5, 6]]
    tm.assert_frame_equal(result, expected)
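The expected index positions above rely on missing values comparing equal to one another for duplicate detection; a two-line illustration:

import numpy as np
from pandas import DataFrame

df = DataFrame({'A': [None, None, 'foo'], 'C': [1.0, np.nan, np.nan]})
# The second None in 'A' and the second NaN in 'C' count as duplicates.
print(df.duplicated('A').tolist())   # [False, True, False]
print(df.duplicated('C').tolist())   # [False, False, True]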
Example #19
def test_drop_duplicates_for_take_all():
    df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                            'foo', 'bar', 'qux', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': lrange(8)})

    # single column
    result = df.drop_duplicates('AAA')
    expected = df.iloc[[0, 1, 2, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.iloc[[2, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.iloc[[2, 6]]
    tm.assert_frame_equal(result, expected)

    # multiple columns
    result = df.drop_duplicates(['AAA', 'B'])
    expected = df.iloc[[0, 1, 2, 3, 4, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['AAA', 'B'], keep='last')
    expected = df.iloc[[0, 1, 2, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(['AAA', 'B'], keep=False)
    expected = df.iloc[[0, 1, 2, 6]]
    tm.assert_frame_equal(result, expected)
Example #20
def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = 'snx'
    #URL of form:
    #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \
          '&e=.csv&h={2}'

    idx_mod = idx_sym.replace('^', '@%5E')
    urlStr = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    #LOOP across component index structure,
    #break when no new components are found
    while (True in mask):
        urlStr = url.format(idx_mod, stats,  comp_idx)
        lines = (urllib.urlopen(urlStr).read().decode('utf-8').strip().
                 strip('"').split('"\r\n"'))

        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index('ticker')
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
Example #21
def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = "snx"
    # URL of form:
    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = _YAHOO_COMPONENTS_URL + "s={0}&f={1}&e=.csv&h={2}"

    idx_mod = idx_sym.replace("^", "@%5E")
    url_str = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    # LOOP across component index structure,
    # break when no new components are found
    while True in mask:
        url_str = url.format(idx_mod, stats, comp_idx)
        with urlopen(url_str) as resp:
            raw = resp.read()
        lines = raw.decode("utf-8").strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=["ticker", "name", "exchange"])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index("ticker")
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
Example #22
def drop_reqpeat01():
    data=DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]})
    print data
    print data.duplicated()
    print data.drop_duplicates()
    data['v1']=range(7)
    print data.drop_duplicates(['k1'])
    print data
    print data.drop_duplicates(['k1','k2'],keep='last')
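For reference, the results those print statements produce for this frame, written out as a runnable sketch (drop_duplicates preserves the original index labels):

from pandas import DataFrame

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})

print(data.duplicated().tolist())
# [False, True, False, False, True, False, True]
print(data.drop_duplicates().index.tolist())              # [0, 2, 3, 5]
data['v1'] = range(7)
print(data.drop_duplicates(['k1']).index.tolist())         # [0, 3]
print(data.drop_duplicates(['k1', 'k2'], keep='last').index.tolist())
# [1, 2, 4, 6]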
Example #23
File: io.py Project: dhhagan/ACT
def read_thermo_dat(model='nox', runDir=os.getcwd(), sample_int='1min', start=None, end=None):
    '''
    Reads Thermo Scientific data from .dat files.

    model: one of 'nox', 'sox', or 'o3'.
    Returns the number of files read and a DataFrame containing all of the
    data, munged and organized, for the Thermo Scientific line of
    atmospheric gas analyzers.

    >>> filecount, no = read_thermo_dat('o3', runDir=dataDir)
    '''

    # If the data is in a different directory, change the directory
    if os.getcwd() != runDir:
        os.chdir(runDir)
    
    # Set the model name based on input
    if model =='nox':
        instrument = '42I'
    elif model == 'sox':
        instrument = '43I'
    elif model == 'o3':
        instrument = '49I'
    else:
        sys.exit("The model you defined is not valid or supported yet.")
    
    # grab all files in the directory for a given instrument with the .dat file extension
    files = get_files(instrument,fileType='dat',start=start, end=end, runDir=runDir)
    
    fileNo = 1
    data = DataFrame()
    
    # Concatenate the data from each file together to build one big dataframe
    for each in files:
        newData = pd.read_table(each, sep='\s+', skiprows=4, header=5, parse_dates=[[1,0]], keep_date_col=True, index_col='Date_Time', warn_bad_lines=True)
        data = pd.concat([data, newData])
        fileNo += 1
      
    # Create a duplicate column containing the index to easily drop all duplicate rows from merging files containing
    #  the same data
    data['stamp'] = data.index
    data = data.drop_duplicates(subset='stamp')
    
    # Depending on the model, do some stuff to clean it up
    if model == 'nox':
        data['no2'] = data['nox'] - data['no']
        
    # resample the data based on chosen imput
    data = data.resample(sample_int)
    
    return (fileNo, data)
Example #24
def PrepeareAndSave(uid, dataToSearch, data, fileName="data.csv"):
    people = []
    address = []
    poi = []
    activity = []
    timeB = []
    timeE = []
    entities = []
    taxonomy = []
    concepts = []
    txt = []

    for i in xrange(0, len(dataToSearch["people"])):
        for j in xrange(0, len(data[i][2])):
            people.append(dataToSearch["people"][i])
            address.append(dataToSearch["address"][i])
            poi.append(dataToSearch["poi"][i])
            activity.append(dataToSearch["activity"][i])
            timeB.append(dataToSearch["timeB"][i])
            timeE.append(dataToSearch["timeE"][i])

            entities.append(data[i][2][j])
            taxonomy.append(data[i][3][j])
            concepts.append(data[i][4][j])
            txt.append(base64.b64encode(data[i][1][j]))

    df = DataFrame()
    df["people"] = people
    df["address"] = address
    df["poi"] = poi
    df["activity"] = activity
    df["entities"] = entities
    df["taxonomy"] = taxonomy
    df["concepts"] = concepts
    df["timeB"] = timeB
    df["timeE"] = timeE
    df["txt"] = txt
    df = df.drop_duplicates(take_last=True).reset_index().drop(["index"], axis=1)
    # print df
    try:
        df_old = read_csv(ROOT + str(uid) + "/" + fileName, ";")
        df_new = [df_old, df]
        df = pd.concat(df_new).drop(["Unnamed: 0"], axis=1)
    except:
        print ("New Data")
    df.to_csv(ROOT + str(uid) + "/" + fileName, sep=";")
    return df
Example #25
def slide_10():
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    print data
    print data.duplicated()
    print data.duplicated('k1')
    print data.drop_duplicates()

    data['v1'] = range(7)
    print data
    print data.drop_duplicates(['k1'])
    print data.drop_duplicates(['k1', 'k2'], take_last=True)
Example #26
    def compute_tf_idf(self):
        # Find total number of document
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])

        base_df = df[['e_type', 'entity']]
        base_df = base_df.drop_duplicates()

        doc_t_df = df.drop_duplicates().groupby('entity').size()

        results = self.cursor.execute('SELECT did, total_word FROM documents')
        tmp = results.fetchall()
        df2 = DataFrame(tmp, columns=['did', 'total_word'])

        tmp = df[['did', 'entity']].groupby(['did', 'entity']).size().reset_index()
        tmp.rename(columns={0: 'term_freq'}, inplace=True)

        tf_idf_list = []

        for row in tmp.iterrows():
            values = row[1]
            did = values[0]
            entity = values[1]
            term_freq = values[2]
            total_word = df2[df2['did'] == did]['total_word'].get_values()[0]
            tf = float(term_freq) / total_word
            doc_t = doc_t_df.get_value(entity)
            idf = np.log(total_doc / doc_t)
            tf_idf = tf * idf
            tf_idf_list.append([entity, tf_idf])

        tf_idf_df = DataFrame(tf_idf_list, columns=['entity', 'tf_idf'])
        tf_idf_df = tf_idf_df.groupby('entity').agg('sum')

        base_df.loc[:, 'tf_idf'] = base_df['entity'].apply(lambda x: tf_idf_df['tf_idf'][x])

        return base_df
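The loop above is the standard term-frequency times inverse-document-frequency weighting, summed per entity across documents. Plugging hypothetical numbers into the same formula:

import numpy as np

total_doc = 10     # documents in the corpus (hypothetical)
doc_t = 2          # documents containing the entity
term_freq = 3      # occurrences of the entity in one document
total_word = 120   # words in that document

tf = float(term_freq) / total_word   # 0.025
idf = np.log(total_doc / doc_t)      # ln(5) ~ 1.609
tf_idf = tf * idf                    # ~ 0.040
print(tf, idf, tf_idf)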
Example #27
def deal_string02():
    import json
    db=json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    print len(db)
    print db[0]
    print db[0].keys()
    print db[0]['nutrients'][0]
    nutrients=DataFrame(db[0]['nutrients'])
    print nutrients[:7]
    info_keys=['description','group','id','manufacturer']
    info=DataFrame(db,columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    nutrients=[]
    for rec in db:
        fnuts=DataFrame(rec['nutrients'])
        fnuts['id']=rec['id']
        nutrients.append(fnuts)
    nutrients=pd.concat(nutrients,ignore_index=True)
    print nutrients
    print nutrients.duplicated().sum()
    nutrients=nutrients.drop_duplicates()
    col_mapping={'description':'food','group':'fgroup'}
    info=info.rename(columns=col_mapping,copy=False)
    print info
    col_mapping={'description':'nutrient','group':'nutgroup'}
    nutrients=nutrients.rename(columns=col_mapping,copy=False)
    print nutrients
    ndata=pd.merge(nutrients,info,on='id',how='outer')
    print ndata
    print ndata.ix[3000]
    result=ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
    # print result
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient=ndata.groupby(['nutgroup','nutrient'])
    get_maximum=lambda x:x.xs(x.value.idxmax())
    get_minimum=lambda x:x.xs(x.value.idxmin())
    max_foods=by_nutrient.apply(get_maximum)[['value','food']]
    max_foods.food=max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
Example #28
def hierarchical_clusters( log, show_plot=None ):
    """Translates traces to Parikh vectors and computes in the vector space
       a hierarchical clustering."""
    def get_parikh(case,alphabet):
        v = zeros(len(alphabet),dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] +1
        # canonical representation
        m = min(v)
        return v - m   
    
    actsind = {}
    i = 0
    for act in log.get_alphabet():
        actsind[act] = i
        i = i +1

    uniq_cases = log.get_uniq_cases()
    N = len(uniq_cases)
    M = len(actsind)
    data = zeros((N,M),dtype=int)
    i = 0
    parikhdict = {}
    for case in uniq_cases.keys():
        data[i] = get_parikh(case,actsind)
        str_i = ','.join(map(str,data[i]))
        if str_i not in parikhdict:
            parikhdict[str_i] = [i]
        else:
            parikhdict[str_i].append(i)
        i = i + 1
    df = DataFrame(data)
    data_uniq = df.drop_duplicates()
    Y = pdist(data_uniq,metric='euclidean')
    Z = linkage(Y,method='average')
    dendrogram(Z)
    show()
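A worked example of the Parikh encoding used above, with a hypothetical three-activity alphabet: each trace becomes a vector of activity counts, shifted so its minimum entry is zero (the canonical representation):

from numpy import zeros

alphabet = {'a': 0, 'b': 1, 'c': 2}
case = ['a', 'b', 'a', 'c', 'a']

v = zeros(len(alphabet), dtype=int)
for act in case:
    v[alphabet[act]] += 1          # raw activity counts
print(v)                           # [3 1 1]
print(v - min(v))                  # [2 0 0]  canonical Parikh vector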
Example #29
def fix_event_type(df: DataFrame):
    '''
    One-hot encodes the event types, drops rare dummy columns (fewer than 50
    occurrences), sums the dummies per id and adds a num_events total, then
    keeps a single row per id.
    :param df: Dataframe object.
    :return: Modified Dataframe.
    '''

    a = time.time()

    colsf = df['id'].ravel()            # list of all IDs
    unique = pd.Series(colsf).unique()  # get unique IDs
    u_counts = []                       # list of unique counts (UNUSED)
    counts_bucket = []                  # bucket of counts (UNUSED)
    df = pd.get_dummies(df)             # create dummy variables
    todrop = df.sum() < 50              # get columns where sum of dummy column < 50
    dropcols = df.columns[todrop]       # get those column names
    df = df.drop(dropcols, axis=1)      # drop those columns
    df['num_events'] = 0                # create number of events columns, set to 0
    # print(df.columns)
    print(str(len(unique)))

    for ii in range(0,len(unique)):     # loop through all the unique IDs
        subset = df.loc[df['id'] == unique[ii]]     # subset by that ID
        the_dummies = subset.columns != 'id'        # get all columns that do not equal that ID
        aa = subset.iloc[:, subset.columns != 'id'].sum().tolist()  # get all of those columns to list
        event_sum = np.sum(aa)      # sum all of those
        
        # aa = aa.set_index([[subset.index[0]]])
        # subset.iloc[:,subset.columns != 'id'] = aa
        df = df.set_value(subset.index, the_dummies, aa)
        df = df.set_value(subset.index, 'num_events', event_sum)
        # df.loc[subset.index] = subset
    df = df.drop_duplicates('id')
    print(df)
    b = time.time()
    print(b-a)
    return df
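The per-id loop above (subset by id, sum the dummy columns, write the sums back, then drop_duplicates('id')) can also be expressed as a groupby-sum; a minimal sketch of that alternative on hypothetical data:

import pandas as pd
from pandas import DataFrame

events = DataFrame({'id': [1, 1, 2], 'event': ['click', 'buy', 'click']})
dummies = pd.get_dummies(events, columns=['event'])

agg = dummies.groupby('id', as_index=False).sum()        # one row per id
agg['num_events'] = agg.drop(columns='id').sum(axis=1)   # total events per id
print(agg)
#    id  event_buy  event_click  num_events
# 0   1          1            1           2
# 1   2          0            1           1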
Example #30
class DropDuplicates(object):

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        self.df = DataFrame({'key1': key1, 'key2': key2,
                             'value': np.random.randn(N * K)})
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        N = 1000000
        K = 10000
        key1 = np.random.randint(0, K, size=N)
        self.df_int = DataFrame({'key1': key1})
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        self.df_bool.drop_duplicates(inplace=inplace)
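This is an asv (airspeed velocity) style benchmark class, so it is normally driven by the benchmark runner; outside of it, a method can still be exercised by hand, e.g. with timeit. A sketch, assuming the imports used in setup (np, Series, DataFrame, tm) are available in the module:

import timeit

bench = DropDuplicates()
bench.setup(inplace=False)
elapsed = timeit.timeit(lambda: bench.time_frame_drop_dups(inplace=False),
                        number=10)
print('10 runs of frame drop_duplicates: %.3fs' % elapsed)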