Example 1
    def handle_community(self, community, **options):
        #from mpl_toolkits.mplot3d import Axes3D
        canvas = pyplot.figure().gca(projection='3d')

        clothing_vectors = numpy.array([
            cast_elements_to_floats(individual["vectors"]) for individual in community.kernel.individual_set.all()
        ])
        centroids, labels = kmeans2(clothing_vectors, 10, minit="points")

        clothing_frame = DataFrame()
        clothing_by_cluster = sorted(zip(labels, clothing_vectors), key=itemgetter(0))
        current_label = None
        for label, vector in clothing_by_cluster:
            if label != current_label:
                current_label = label
                clothing_frame = clothing_frame.append(Series(data=centroids[current_label]), ignore_index=True)
            clothing_frame = clothing_frame.append(Series(data=vector), ignore_index=True)

        #centroids_frame = DataFrame(centroids)
        #centroids_frame.T.plot()
        #centroids_frame.drop(range(20, 4096), axis=1, inplace=True)
        #print(centroids_frame.head())

        self.plot_data(canvas, clothing_frame, 'b')
        pyplot.show()
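Note: the loop above grows clothing_frame one row at a time with DataFrame.append. A minimal alternative sketch under the same assumptions (a NumPy array of vectors plus the centroids and labels returned by kmeans2); cluster_frame is a hypothetical helper name, not part of the example:

import numpy as np
import pandas as pd

def cluster_frame(vectors, centroids, labels):
    # interleave each cluster centroid with its member vectors, then build the frame once
    blocks = []
    for label in np.unique(labels):
        blocks.append(pd.DataFrame([centroids[label]]))        # centroid row first
        blocks.append(pd.DataFrame(vectors[labels == label]))  # then the cluster members
    return pd.concat(blocks, ignore_index=True)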
Example 2
def getIndexChangeRate(startDate, endDate):
    df_result = DataFrame()
    for mkt in ['sh', 'sz', 'zxb', 'cyb']:
        df = ts.get_hist_data(mkt, start=startDate, end=endDate).reset_index()
        df['gap'] = df['high'] - df['low']
        df['gap_rate'] = df['gap'] / df['close'] * 100
        df['mkt'] = mkt
        df_result = df_result.append(df)

    fileName = r'D:\stock\index_changeRate_' + startDate + '_' + endDate + '.csv'
    df_result = df_result.loc[:, ['date', 'mkt', 'close', 'volume', 'price_change', 'p_change', 'gap', 'gap_rate']]
    df_result = df_result.sort_index(by='date', ascending=False)
    df_result.to_csv(fileName, index=False)
Example 3
class Record(object):
    def __init__(self):
        self.trade_history = DataFrame()
        self.position_history = DataFrame()
        self.portfolio_value_history = DataFrame()

    def update_trade(self, date, trade_type, symbol, amount, price):
        newtrade = DataFrame(
            {"Date": [date], "Trade_type": [trade_type], "Symbol": [symbol], "Amount": [amount], "Price": [price]}
        )
        self.trade_history = self.trade_history.append(newtrade, ignore_index=True)

    def update_position(self, date, p):
        newposition = DataFrame(
            {
                "Date": [date],
                "Symbol": [p.symbol],
                "Amount": [p.amount],
                "Avg_price": [p.avg_price],
                "Position_value": [p.position_value],
            }
        )
        self.position_history = self.position_history.append(newposition, ignore_index=True)

    def update_portfolio_value(self, date, port, pos, cash):
        newport = DataFrame({"Date": [date], "Portfolio_value": [port], "Position_value": [pos], "Cash": [cash]})
        self.portfolio_value_history = self.portfolio_value_history.append(newport, ignore_index=True)
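Note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and appending inside a loop copies the whole frame on every call. A minimal sketch of the same record-keeping idea for current pandas, accumulating plain dicts and building the frame only when it is read; ListBackedRecord is a hypothetical name, not part of the example above:

import pandas as pd

class ListBackedRecord(object):
    def __init__(self):
        self._trades = []  # one dict per trade

    def update_trade(self, date, trade_type, symbol, amount, price):
        # appending to a Python list is cheap; the DataFrame is built on demand
        self._trades.append({"Date": date, "Trade_type": trade_type, "Symbol": symbol,
                             "Amount": amount, "Price": price})

    @property
    def trade_history(self):
        return pd.DataFrame(self._trades)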
Example 4
	def scrab_one_user(self, uid, num):
		## log in
		weiboLogin = WeiboLogin.WeiboLogin(self.username, self.pwd, self.header)
		weiboLogin.Login()

		## start fetching pages
		WBmsg = GetWeiboPage.getWeiboPage()
		WBmsg.body['uid'] = uid

		## build the weibo data storage structure
		wb_detail = []
		wb_all = {}
		wb_all = wb_all.fromkeys(wb_detail, [])
		wb_frame = DataFrame(wb_all, index=[])
		for n in range(1, num):
			# build the page url
			url = 'http://weibo.com/' + uid + '?is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=' + str(n)

			#print WBmsg.get_firstpage(url,n)
			all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_firstpage(url, n)))
			wb_frame = wb_frame.append(all_weibo, ignore_index=True)
			all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_secondpage(url, n)))
			wb_frame = wb_frame.append(all_weibo, ignore_index=True)
			all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_thirdpage(url, n)))
			wb_frame = wb_frame.append(all_weibo, ignore_index=True)

			print n
		return wb_frame
Example 5
class Append(object):

    goal_time = 0.2

    def setup(self):
        self.df1 = DataFrame(np.random.randn(10000, 4),
                             columns=['A', 'B', 'C', 'D'])
        self.df2 = self.df1.copy()
        self.df2.index = np.arange(10000, 20000)
        self.mdf1 = self.df1.copy()
        self.mdf1['obj1'] = 'bar'
        self.mdf1['obj2'] = 'bar'
        self.mdf1['int1'] = 5
        try:
            with warnings.catch_warnings(record=True):
                self.mdf1.consolidate(inplace=True)
        except:
            pass
        self.mdf2 = self.mdf1.copy()
        self.mdf2.index = self.df2.index

    def time_append_homogenous(self):
        self.df1.append(self.df2)

    def time_append_mixed(self):
        self.mdf1.append(self.mdf2)
Example 6
    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame([])
        df2 = DataFrame([])
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=['bar', 'foo'])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)
Example 7
    def get_endpoint_timeframe(self):
        result_df = DataFrame()
        res_len = self._make_req(self.chunk_start, self.chunk_end)
        self._wait_for_rate_limit()
        # check to see if there are possibly more results to get if close to max_result
        # this will make additional requests until either the results are smaller than 9k or the timeframe is 1day
        if res_len / self.max_results > .90:
            delta = self.chunk_end - self.chunk_start
            step_size = math.floor(delta.days / 2)
            self.chunk_end = self.chunk_start + timedelta(days=step_size)
            # if step is greater than a day make request
            if self.chunk_start != self.chunk_end:
                self.get_endpoint_timeframe()
            # if no step save data and just increment another day
            else:
                self.chunk_start = self.chunk_end + timedelta(days=1)
                self.chunk_end = self.chunk_end + timedelta(days=1)
                self.get_endpoint_timeframe()
                # parse & append results to dataframe
                df = self._parse_json()
                result_df = result_df.append(df)

        # pick up where we left off from chunking
        elif self.chunk_end != self.end_dt:
            self.chunk_start = self.chunk_end
            self.chunk_end = self.end_dt
            self.get_endpoint_timeframe()
            # parse & append results to dataframe
            df = self._parse_json()
            result_df = result_df.append(df)
        return result_df
Example 8
def get_sex_type():
    file_name = 'data/info_train.csv'
    y = pd.read_csv(file_name,header=None,index_col=0)
    male_id = y[y[1]<7].index
    m = DataFrame([0]*male_id.size,index=male_id,columns=['sex'])
    female_id = y[y[1]>6].index
    f = DataFrame([1]*female_id.size,index=female_id,columns=['sex'])
    m.append(f).to_csv('data/train_sex.csv')
Example 9
    def get_topwords(self, countries, thresh=10, tf_idf=False):
        tw = DataFrame()
        for r in range(len(self.df)):
            if self.df.loc[r, 'country_id'] in countries:
                if tf_idf:
                    tw = tw.append(self.tf_idf.loc[r, :])
                else:
                    tw = tw.append(self.df.loc[r, :])

        return tw.mean().order(ascending=False)[:thresh]
Example 10
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)

    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]

    print pd.value_counts(info.group)[:10]

    print "Now let's handle all of the nutrients"
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "There are a lot of duplicates"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "Both info and nutrients have description and group, so rename them"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print max_foods.ix['Amino Acids']['food']
Example 11
    def test_append(self):
        begin_index = self.frame.index[:5]
        end_index = self.frame.index[5:]

        begin_frame = self.frame.reindex(begin_index)
        end_frame = self.frame.reindex(end_index)

        appended = begin_frame.append(end_frame)
        assert_almost_equal(appended['A'], self.frame['A'])

        del end_frame['A']
        partial_appended = begin_frame.append(end_frame)
        self.assertIn('A', partial_appended)

        partial_appended = end_frame.append(begin_frame)
        self.assertIn('A', partial_appended)

        # mixed type handling
        appended = self.mixed_frame[:5].append(self.mixed_frame[5:])
        assert_frame_equal(appended, self.mixed_frame)

        # what to test here
        mixed_appended = self.mixed_frame[:5].append(self.frame[5:])
        mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:])

        # all equal except 'foo' column
        assert_frame_equal(
            mixed_appended.reindex(columns=['A', 'B', 'C', 'D']),
            mixed_appended2.reindex(columns=['A', 'B', 'C', 'D']))

        # append empty
        empty = DataFrame({})

        appended = self.frame.append(empty)
        assert_frame_equal(self.frame, appended)
        self.assertIsNot(appended, self.frame)

        appended = empty.append(self.frame)
        assert_frame_equal(self.frame, appended)
        self.assertIsNot(appended, self.frame)

        # overlap
        self.assertRaises(ValueError, self.frame.append, self.frame,
                          verify_integrity=True)

        # new columns
        # GH 6129
        df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
        row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
        expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {
                             'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}})
        result = df.append(row)
        assert_frame_equal(result, expected)
Example 12
def concatenate_years_data():
    '''This function combines all of the dataframes
    for each year into one dataframe consisting
    of the births data from all of the years.'''
    years = np.array(range(1880,2011))  # These are all of the years for which we have data
    current_directory=os.getcwd()
    if current_directory!='C:\\Users\\Jormak\\PycharmProjects\\PANDAS_Book\\pydata-book\\ch02\\names':
        os.chdir('C:\\Users\\Jormak\\PycharmProjects\\PANDAS_Book\\pydata-book\\ch02\\names')
    all_years = DataFrame()
    for year in years:
        one_year = pd.read_csv('yob'+str(year)+'.txt', names=['name','sex','births'])  # note that read_csv can read .txt files too
        # DataFrame.append returns a new frame, so the result must be reassigned
        all_years = all_years.append(one_year, ignore_index=True)
    return all_years
Example 13
    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that are in
        # the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        dicts = [{'foo': 9}, {'bar': 10}]
        with tm.assert_produces_warning(None):
            result = df.append(dicts, ignore_index=True, sort=True)

        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        assert_frame_equal(result, expected)
Example 14
    def test_append_concat(self):
        rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
        ts = Series(np.random.randn(len(rng)), rng)
        df = DataFrame(np.random.randn(len(rng), 4), index=rng)

        result = ts.append(ts)
        result_df = df.append(df)
        ex_index = DatetimeIndex(np.tile(rng.values, 2))
        tm.assert_index_equal(result.index, ex_index)
        tm.assert_index_equal(result_df.index, ex_index)

        appended = rng.append(rng)
        tm.assert_index_equal(appended, ex_index)

        appended = rng.append([rng, rng])
        ex_index = DatetimeIndex(np.tile(rng.values, 3))
        tm.assert_index_equal(appended, ex_index)

        # different index names
        rng1 = rng.copy()
        rng2 = rng.copy()
        rng1.name = 'foo'
        rng2.name = 'bar'
        assert rng1.append(rng1).name == 'foo'
        assert rng1.append(rng2).name is None
Example 15
def handleBi5(infile, fileDataFrame):

    if os.path.getsize(infile) == 0:
        return fileDataFrame

    array = infile.split('/')
    print array
    alen = len(array)

    dateWithoutHour = long(datetime(int(array[alen-4]),int(array[alen-3]),int(array[alen -2])).strftime("%s"))
    dateWithoutMilisec = (dateWithoutHour+int(array[alen-1].split('_')[0].split('h')[0])*3600)*1000
    subprocess.call("xz -dkc --suffix=bi5 " + infile + ">tmp.bin", shell=True)


    hdfDir = "./hdf/" + infile.split('/')[2]
    if not os.path.exists(hdfDir):
        os.makedirs(hdfDir)
    cvsFileName = hdfDir + "/" + infile.split('/')[3]

    if fileDataFrame.empty:
        if os.path.exists(cvsFileName):
            fileDataFrame =	read_csv(cvsFileName, index_col=0)
        else:
            fileDataFrame = DataFrame()

    fileDataFrame = fileDataFrame.append(processBinFile("tmp.bin", dateWithoutMilisec))

    print fileDataFrame.iloc[0]
    return fileDataFrame
Example 16
def read_data(features, feat_path='out'):
    frame = DataFrame()

    for data_path, data_ids in data_paths_and_ids:
        frame = frame.append(frame_for_id(features, feat_path, data_ids, data_path))

    return frame
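The same combination step can be written without DataFrame.append; a sketch assuming the frame_for_id helper and data_paths_and_ids iterable used above, with read_data_concat as a hypothetical name:

import pandas as pd

def read_data_concat(features, feat_path='out'):
    # build one frame per (path, ids) pair and stack them with a single concat call
    frames = [frame_for_id(features, feat_path, data_ids, data_path)
              for data_path, data_ids in data_paths_and_ids]
    return pd.concat(frames) if frames else pd.DataFrame()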
Example 17
    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        assert_frame_equal(result, expected)

        # different columns
        dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
                 {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        assert_frame_equal(result, expected)
Example 18
class matchbox:
    def __init__(self, articlepaths):
        self.num_exports = 0
        self.num_articles_total = len(articlepaths)
        self.num_articles_matched = 0
        self.num_matches = 0
        self.dataframe = DataFrame()
        self.init_time = time.strftime("%Y-%m-%d_%H-%M-%S_")

    def update(self, matches):
        self.dataframe = self.dataframe.append(matches, ignore_index=True)
        self.num_articles_matched += 1
        self.num_matches += len(matches)
        print('Matched {} places in article {} of {} ({:.2%} complete). '
              'Total: {}.'.format(len(matches),
                                          self.num_articles_matched,
                                          self.num_articles_total,
                                          self.num_articles_matched / self.num_articles_total,
                                          self.num_matches))

    def empty_into_csv(self):
        self.num_exports += 1
        outname = outdir + self.init_time + 'pubs_aegypti_' + str(self.num_exports) + '.csv'
        self.dataframe.to_csv(outname, encoding='utf-8')
        print('Wrote matches from chunk {} to {}.'.format(self.num_exports, outname))
        del self.dataframe
        self.dataframe = DataFrame()
Example 19
def frame_for_id(features, feat_path='out', data_ids=sts.sts12.train_ids, data_dir='STS2012-train'):
    frame = DataFrame()

    for data_id in data_ids:
        data = {}

        for feat_id in features:
            data_id_dir = data_id[9:] if data_id.startswith("surprise.") else data_id
            feat_fn = os.path.join(feat_path, data_dir, data_id_dir, "%s.txt" % feat_id)

            data[feat_id] = series_from_feat(feat_fn)

        new_frame = DataFrame(data)
        new_frame['data_id'] = data_id

        gs_fn = os.path.join(repos_dir, 'data', data_dir, "STS.gs.%s.txt" % data_id)

        if os.path.exists(gs_fn):
            new_frame['gs'] = Series(loadtxt(gs_fn))
        else:
            new_frame['gs'] = None

        frame = frame.append(new_frame)

    frame['data_set'] = data_dir

    return frame
Example 20
def _extract_data(file_name, filters, fields=None, summary=None,
                  classname='Table', mode='walk', hash=''):
    '''
    Not meant for direct use.  This is broken out of :func:`extract_data` so we
    can wrap the code in a caching decorator to speed up loading of data from
    disk.  The hash is created by :func:`extract_data` to ensure that the cache
    is cleared if the last modified time changes.  Note that if you move the
    file to a different folder, this does not clear the cache.
    '''
    log.info('... No cached copy of data found, reloading data')
    with tables.openFile(file_name, 'r') as h:
        data = DataFrame()
        if mode == 'walk':
            iterator = walk_nodes(h.root, filters, classname)
        elif mode == 'pattern':
            iterator = p_iter_nodes(h.root, filters)
        else:
            raise ValueError, 'Unsupported mode {}'.format(mode)

        for node in iterator:
            log.info('... Found node %s', node._v_pathname)
            if type(node) == tables.Table:
                frame = extract_node_data(node, fields, summary)
                data = data.append(frame, ignore_index=True)
            else:
                raise NotImplementedError
    return data
Example 21
def train_data_construct(bins, train_set, iteration, realtime = False):
    train_bins = defaultdict(tuple)

    print 'start to construct the train data bins'
    if realtime:
        idx = 0
        for bin in bins:
            if len(bin) > 0:
                feature_bin = DataFrame()
                lable_bin = Series()
                for uid in bin:
                    tmp = train_set[train_set['product_uid'] == int(uid)]
                    if not tmp.empty:
                        feature_bin = feature_bin.append(tmp)
                        # should drop the relevance data here
                        lable_bin = lable_bin.append(tmp['relevance'])
                train_bins[idx] = (feature_bin,lable_bin)
                print len(train_bins[idx][0]), ' entries in bin', idx
                # if idx == 0:
                #     feature_bin.to_csv('feature_bin.csv')
                idx += 1
        f1 = file('../data/train_bins'+str(iteration)+'.pkl','wb')
        pk.dump(train_bins,f1)
    else:
        f1 = file('../data/train_bins'+str(iteration)+'.pkl','rb')
        train_bins=pk.load(f1)
    print 'finish constructing training bins'

    return train_bins
Example 22
    def test_append_length0_frame(self):
        df = DataFrame(columns=['A', 'B', 'C'])
        df3 = DataFrame(index=[0, 1], columns=['A', 'B'])
        df5 = df.append(df3)

        expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C'])
        assert_frame_equal(df5, expected)
Example 23
def main():

    logger = get_root_logger()
    get_header(logger, 'LOADING PROJECTIONS')

    client = APIClient()

    # grab dataframe shape from a trial run
    data = client.get_data('weekly-projections', 'json', 'QB')
    test_df = json_normalize(data['Projections'])

    # get DF structure from columns in test_df
    cols = test_df.columns
    df = DataFrame(columns=cols)

    # grab current week
    current_week = test_df.week.values[0]

    # loop through all weeks up to current week
    for wk in [str(x) for x in range(int(current_week))]:
        logger.info('Processing projections for week {0}'.format(int(wk) + 1))
        # loop through all positions
        for pos in ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']:
            tmp_data = client.get_data('weekly-projections', 'json', pos, wk)
            tmp_df = json_normalize(tmp_data['Projections'])
            df = df.append(tmp_df)

    # import this df directly to PG DB
    conn = DBClient()
    conn.load(df, 'projections', schema='raw', if_exists='replace')
Example 24
    def test_append_dtype_coerce(self):

        # GH 4993
        # appending with datetime will incorrectly convert datetime64
        import datetime as dt
        from pandas import NaT

        df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
                                            dt.datetime(2013, 1, 2, 0, 0)],
                        columns=['start_time'])
        df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
                                             dt.datetime(2013, 1, 3, 6, 10)],
                                            [dt.datetime(2013, 1, 4, 0, 0),
                                             dt.datetime(2013, 1, 4, 7, 10)]],
                        columns=['start_time', 'end_time'])

        expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10),
                                   dt.datetime(2013, 1, 4, 7, 10)],
                                  name='end_time'),
                           Series([dt.datetime(2013, 1, 1, 0, 0),
                                   dt.datetime(2013, 1, 2, 0, 0),
                                   dt.datetime(2013, 1, 3, 0, 0),
                                   dt.datetime(2013, 1, 4, 0, 0)],
                                  name='start_time')], axis=1)
        result = df1.append(df2, ignore_index=True)
        assert_frame_equal(result, expected)
Example 25
    def OnRtnTrade(self, Trade):
        """Trade fill report callback"""
        # print('OnRtnTrade:', Trade)
        print('OnRtnTrade:\n', Utils.code_transform(Trade))
        PyCTP_Trader_API.dfOnRtnTrade = DataFrame.append(PyCTP_Trader_API.dfOnRtnTrade,
                                                         other=Utils.code_transform(Trade),
                                                         ignore_index=True)
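On pandas 2.x, where DataFrame.append no longer exists, the same single-record append could be done with pd.concat. A sketch under the assumption that Utils.code_transform returns a flat dict of scalar fields; append_trade is a hypothetical helper name:

import pandas as pd

def append_trade(df, trade_dict):
    # wrap the single record in a one-row DataFrame, then concatenate
    row = pd.DataFrame([trade_dict])
    return pd.concat([df, row], ignore_index=True)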
Example 26
def test(dataset, overshoot_threshold):
	from numpy import where, zeros
	from sklearn.neighbors.kde import KernelDensity
	folder = make_issue_specific_figure_folder('108 cluster after removing outliers', dataset)
	fit, par, gen, ids = IO.load_pickled_generation_dataframe(dataset)
	o = where(fit.overshoot > overshoot_threshold)[0]
	#not_o = where(fit.overshoot <= overshoot_threshold)[0]
	par = par.drop(o)
	fit = fit.drop(o)
	g1 = par.groupby('ssmm_nAgents').groups.keys()
	g2 = par.groupby('ssmm_latency_mu').groups.keys()
	#stdev_mean = zeros((len(g1), len(g2)))
	data = DataFrame(columns=['ssmm_nAgents', 'ssmm_latency_mu', 'stdev_mean'])
	for a, ssmm_nAgents in enumerate(g1):
		print ssmm_nAgents
		for l, ssmm_latency_mu in enumerate(g2):
			row = dict()
			try:
				row['stdev_mean'] = fit[(par['ssmm_latency_mu'] == ssmm_latency_mu) & (par['ssmm_nAgents'] == ssmm_nAgents)]['stdev'].mean()
				row['ssmm_nAgents'] = ssmm_nAgents
				row['ssmm_latency_mu'] = ssmm_latency_mu
				#print row
				data = data.append(row, ignore_index = True)
			except TypeError:
				print "ARGHS"

	X, Y = np.meshgrid(list(g1), list(g2))  # g1 and g2 already hold the group keys
	xy = np.vstack([Y.ravel(), X.ravel()]).T
	return data
Example 27
def getFeatures(filename):
    csvfile = pd.read_csv(filename)  # Reading .csv files containing tweets.
    tweet_ids = csvfile["id_str"]  # Copying the 'id_str' attribute values to a item.
    length = len(tweet_ids)  # Getting the length of 'tweet_ids'.

    d = {"links": ""}  # feature dict for a single tweet; 'links' must start as an empty string because it is concatenated below
    df = DataFrame(d, index=[0])  # Creating a DataFrame

    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
    # Generating Access Token

    for i in range(0, length):
        status = twitter.show_status(id=tweet_ids[i])
        d["id"] = status["id_str"].encode("utf-8")
        d["created_at"] = status["created_at"].encode("utf-8")
        d["from_user"] = status["user"]["screen_name"].encode("utf-8")
        d["followers_count"] = status["user"]["followers_count"]
        d["friends_count"] = status["user"]["friends_count"]
        d["statuses_count"] = status["user"]["statuses_count"]
        d["verified"] = status["user"]["verified"]
        d["location"] = 0 if (len(status["user"]["location"].encode("utf-8")) == 0) else 1
        d["text"] = status["text"].encode("utf-8")
        d["retweet_count"] = status["retweet_count"]
        d["favorite_count"] = status["favorite_count"]
        d["hashtag_count"] = len(status["entities"]["hashtags"])
        d["url_count"] = len(status["entities"]["urls"])
        d["mentions_count"] = len(status["entities"]["user_mentions"])
        if len(status["entities"]["urls"]) > 0:
            for x in range(0, len(status["entities"]["urls"])):
                d["links"] += status["entities"]["urls"][x]["expanded_url"].encode("utf-8") + "  "
        df = df.append(d, ignore_index=True)
        df.to_csv("NSamples.csv")  # Saving file to disk
        d["links"] = ""
    print "\nAll Done!"
Example 28
def prepare_modeling_results(model,all_data,expnoList,path,exptitle):
	results_table = DataFrame(columns=['h','X','X model','S', 'S model', 'P', 'P model', 'expno', 'XnOBS', 'XnPRED', 'SnOBS', 'SnPRED', 'PnOBS', 'PnPRED'])
	for dataset1,expno1 in zip(all_data,expnoList):
		results_table1 = model.simulation(dataset1, expno=expno1)
		# calculate normalized values
		# XnOBS, XnPRED = feature_scaling(results_table1['X'].values,results_table1['X model'].values)
		# SnOBS, SnPRED = feature_scaling(results_table1['S'].values,results_table1['S model'].values)
		# PnOBS, PnPRED = feature_scaling(results_table1['P'].values,results_table1['P model'].values)
		# standardization
		XnOBS, XnPRED = zero_mean_variance(results_table1['X'].values,results_table1['X model'].values)
		SnOBS, SnPRED = zero_mean_variance(results_table1['S'].values,results_table1['S model'].values)
		PnOBS, PnPRED = zero_mean_variance(results_table1['P'].values,results_table1['P model'].values)

		
		# and add them to the table as new columns
		results_table1['XnOBS']  = XnOBS
		results_table1['XnPRED'] = XnPRED
		results_table1['SnOBS']  = SnOBS
		results_table1['SnPRED'] = SnPRED
		results_table1['PnOBS']  = PnOBS
		results_table1['PnPRED'] = PnPRED
		# now add the current experiment to the big table of all experiments
		results_table = results_table.append(results_table1)
	results_table.to_html("{0}results_table_{1}.html".format(path,exptitle))
	return results_table
Example 29
def parse_page(html):
	# parse the html
	soup = BeautifulSoup(html,"lxml")
	# extract the weibo text
	text=soup.find_all(attrs={"node-type":"feed_list_content","class":"WB_text W_f14"})
	# extract the forward counts
	forward=soup.find_all(attrs={"node-type":"forward_btn_text"})
	# extract the comment counts
	comment=soup.find_all(attrs={"node-type":"comment_btn_text"})
	# extract the dates
	date=soup.find_all(attrs={"node-type":"feed_list_item_date"})
	# extract the source platform
	source=soup.find_all(attrs={"action-type":"app_source"})
	# extract the like counts
	like=soup.select('li a[title="赞"]')
	# drop irrelevant entries
	for each in date:
		if each.has_attr('suda-data')==False:
			date.remove(each)
	for each in source:
		if each.has_attr('suda-uatrack')==False:
			source.remove(each)
	for each in like:
		if each.has_attr('suda-uatrack')==False:
			like.remove(each)
	# build the data dict
	wb_de=[]
	wb_al={}
	wb_al=wb_al.fromkeys(wb_de,[])
	wb_fr=DataFrame(wb_al,index=[])
	for i in range(len(text)):
		all_weibo={"text":text_list(text)[i],"date":text_list(date)[i],"source":text_list(source)[i],"forward":text_list(forward)[i],"comment":text_list(comment)[i],"like":text_list(like)[i]}
		wb_fr=wb_fr.append(all_weibo,ignore_index=True)
	    
	return wb_fr
Example 30
def convertToPutJson(csv_file):
    df = cleanColumns(read_csv(csv_file))
    putColumns = ["method", "recordId", "body"]
    putDf = DataFrame(columns = putColumns)

    for recordId in df.index:
        print "Converting data for recordId {recordId}...".format(recordId = recordId)
        body = {}
        
        for col in df.columns:
            body[str(col).strip()] = [str(df[col][recordId]).strip()]
        
        putDfRow = DataFrame([["PUT", str(recordId), body]], columns = putColumns)
        putDf = putDf.append(putDfRow)
    
    json_file = sub("csv|txt", "json", csv_file)
    putDf.to_json(json_file, orient="records")

    with open(json_file, 'r') as target:
        putData = target.read()

    target = open(json_file, 'w')
    putData = putData.replace("},{", "}\n\n{")[1:-1]
    target.write(putData)
    target.close()

    print "Successfully created put data!"
    return json_file
Example 31
df[df['target'] == 1].shape[0]

####imbalanced data!!!

#import pandas_ml as pdml
import random
from pandas import Series, DataFrame

d_class0 = df[df['target'] == 0]
d_class1 = df[df['target'] == 1]

numRows_class0 = len(d_class0.index)
numRows_class1 = len(d_class1.index)

# downsample the class 0

d_class0_downsampled = d_class0.sample(n=numRows_class1,
                                       replace=False,
                                       random_state=42)

# new output data frame containing 1:1 class ratios

data_set = DataFrame()
data_set = data_set.append(d_class0_downsampled)
data_set = data_set.append(d_class1)

# shuffle the rows

numRows_data_set = len(data_set.index)
data_set = data_set.sample(n=numRows_data_set, replace=False)
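The downsample-and-stack step above can also be written without append; a minimal sketch assuming the same d_class0_downsampled and d_class1 frames:

import pandas as pd

# stack the downsampled majority class and the minority class, then shuffle
data_set = pd.concat([d_class0_downsampled, d_class1])
data_set = data_set.sample(frac=1, random_state=42)  # frac=1 returns all rows in random order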
Example 32
cre_path = tkinter.filedialog.askopenfilenames()
root.destroy()
# for root, dirs, files in os.walk(r''+cre_path+''):
#     pass
files=[]
for i in cre_path:
    files.append(i.split('/')[-1])

#####################################################
if cam_tye=='1':
    if cre_frmat=='1':
        # same+video
        # duplicate the rows
        new_data=DataFrame()
        for i in range(len(files)):
            new_data=new_data.append(raw_data)
            
        # assign the Creative Type
        new_data['Creative Type']='Video Page Post Ad'
        
        for file_index in range(len(files)):
            # creative name: each block of row_num rows corresponds to one file
            new_data['Video File Name'][row_num*file_index:row_num*(file_index+1)]=files[file_index]

            # campaign name = old campaign name + creative name
            new_data['Campaign Name'][row_num*file_index:row_num*(file_index+1)]=new_data['Campaign Name'][row_num*file_index:row_num*(file_index+1)]+'_'+files[file_index]

        # ad_set name = old ad_set name + creative name
        new_data['Ad Set Name']=new_data['Ad Set Name']+'_'+new_data['Video File Name']

        # ad name = creative name
Example 33
def test_drop_duplicates():
    pdf = DataFrame(
        {
            "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    gdf = cudf.DataFrame.from_pandas(pdf)
    # single column
    result = gdf.copy()
    result.drop_duplicates("AAA", inplace=True)
    expected = pdf.copy()
    expected.drop_duplicates("AAA", inplace=True)
    assert_df(result, expected)

    result = gdf.drop_duplicates("AAA", keep="last")
    expected = pdf.drop_duplicates("AAA", keep="last")
    assert_df(result, expected)

    result = gdf.drop_duplicates("AAA", keep=False)
    expected = pdf.drop_duplicates("AAA", keep=False)
    assert_df(result, expected)
    assert len(result) == 0

    # multi column
    expected = pdf.loc[[0, 1, 2, 3]]
    result = gdf.drop_duplicates(np.array(["AAA", "B"]))
    assert_df(result, expected)
    result = pdf.drop_duplicates(np.array(["AAA", "B"]))
    assert_df(result, expected)

    result = gdf.drop_duplicates(("AAA", "B"), keep="last")
    expected = pdf.drop_duplicates(("AAA", "B"), keep="last")
    assert_df(result, expected)

    result = gdf.drop_duplicates(("AAA", "B"), keep=False)
    expected = pdf.drop_duplicates(("AAA", "B"), keep=False)
    assert_df(result, expected)

    # consider everything
    df2 = gdf.loc[:, ["AAA", "B", "C"]]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(["AAA", "B"])
    assert_df(result, expected)

    result = df2.drop_duplicates(keep="last")
    expected = df2.drop_duplicates(["AAA", "B"], keep="last")
    assert_df(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(["AAA", "B"], keep=False)
    assert_df(result, expected)

    # integers
    result = gdf.drop_duplicates("C")
    expected = pdf.drop_duplicates("C")
    assert_df(result, expected)
    result = gdf.drop_duplicates("C", keep="last")
    expected = pdf.drop_duplicates("C", keep="last")
    assert_df(result, expected)

    gdf["E"] = gdf["C"].astype("int8")
    result = gdf.drop_duplicates("E")
    pdf["E"] = pdf["C"].astype("int8")
    expected = pdf.drop_duplicates("E")
    assert_df(result, expected)
    result = gdf.drop_duplicates("E", keep="last")
    expected = pdf.drop_duplicates("E", keep="last")
    assert_df(result, expected)

    pdf = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    gdf = cudf.DataFrame.from_pandas(pdf)
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

    pdf = DataFrame([[1, 0], [0, 2]])
    gdf = cudf.DataFrame.from_pandas(pdf)
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

    pdf = DataFrame([[-2, 0], [0, -4]])
    gdf = cudf.DataFrame.from_pandas(pdf)
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

    x = np.iinfo(np.int64).max / 3 * 2
    pdf = DataFrame([[-x, x], [0, x + 4]])
    gdf = cudf.DataFrame.from_pandas(pdf)
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

    pdf = DataFrame([[-x, x], [x, x + 4]])
    gdf = cudf.DataFrame.from_pandas(pdf)
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

    pdf = DataFrame([i] * 9 for i in range(16))
    pdf = pdf.append([[1] + [0] * 8], ignore_index=True)
    gdf = cudf.DataFrame.from_pandas(pdf)
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
Example 34
def test_drop_duplicates():
    df = DataFrame(
        {
            "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates("AAA")
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep="last")
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(["AAA", "B"]))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep="last")
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ["AAA", "B", "C"]]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep="last")
    expected = df2.drop_duplicates(["AAA", "B"], keep="last")
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(["AAA", "B"], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates("C")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("C", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    df["E"] = df["C"].astype("int8")
    result = df.drop_duplicates("E")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("E", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = df.append([[1] + [0] * 8], ignore_index=True)

    for keep in ["first", "last", False]:
        assert df.duplicated(keep=keep).sum() == 0
Example 35
    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=["foo", "bar", "baz", "qux"])

        series = df.loc[4]
        msg = "Indexes have overlapping values"
        with pytest.raises(ValueError, match=msg):
            df.append(series, verify_integrity=True)

        series.name = None
        msg = "Can only append a Series if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df.append(series, verify_integrity=True)

        result = df.append(series[::-1], ignore_index=True)
        expected = df.append(DataFrame({
            0: series[::-1]
        }, index=df.columns).T,
                             ignore_index=True)
        tm.assert_frame_equal(result, expected)

        # dict
        result = df.append(series.to_dict(), ignore_index=True)
        tm.assert_frame_equal(result, expected)

        result = df.append(series[::-1][:3], ignore_index=True)
        expected = df.append(DataFrame({
            0: series[::-1][:3]
        }).T,
                             ignore_index=True,
                             sort=True)
        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        msg = "Can only append a dict if ignore_index=True"
        with pytest.raises(TypeError, match=msg):
            df.append(series.to_dict())

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df.append(row)
        expected = df.append(df[-1:], ignore_index=True)
        tm.assert_frame_equal(result, expected)
Example 36
fid = "stn_gpe_beta"
dv = 'p'
ivs = ["tau_e", "tau_p", "tau_ampa_r","tau_ampa_d", "tau_gabaa_r", "tau_gabaa_d", "tau_stn", "eta", "delta", "k",
       "eta_e", "eta_p", "k_pe", "k_ep", "k_pp"]

# load data into frame
df = DataFrame(data=np.zeros((1, len(ivs))), columns=ivs)
df_dv = DataFrame(data=np.zeros((1, 1)), columns=["fitness"])
for d in directories:
    for fn in os.listdir(d):
        if fn.startswith(fid) and fn.endswith('.h5'):
            f = h5py.File(f"{d}/{fn}", 'r')
            index = int(fn.split('_')[-2])
            if fn.endswith("params.h5"):
                df_tmp = DataFrame(data=np.asarray([[f[dv][key][()] for key in ivs]]), columns=ivs, index=[index])
                df = df.append(df_tmp)
            elif fn.endswith("fitness.h5"):
                df_tmp = DataFrame(data=np.asarray([1/f["f"][()]]), columns=["fitness"], index=[index])
                df_dv = df_dv.append(df_tmp)
df = df.iloc[1:, :]
df_dv = df_dv.iloc[1:, :]
#df['fitness'] = df_dv.loc[:, "fitness"]

# create feature matrix and target vector
y = np.squeeze(df_dv.values)
X = np.asarray([df.pop(iv) for iv in ivs]).T

# perform dimensionality reduction on data
# n_comps = 5
# dim_red = Isomap(n_components=n_comps, n_neighbors=10, p=2)
# X_ld = dim_red.fit_transform(X, y)
Example 37
len(db)
db[0].keys()
db[0]['nutrients'][0]
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
info[:5]
info
info.group.value_counts()

nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)
nutrients
nutrients.duplicated().sum()
nutrients = nutrients.drop_duplicates()
col_mapping = {'description': 'food', 'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info

ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata
ndata.ix[30000]

result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
Example 38
    def checkChangeTriggerResult(owner, repo):
        """Check whether the PRChangeTrigger computation is complete."""
        """When the proxy is switched, the database connection drops, so comment data cannot be queried and review comments can be missed."""
        """Here we re-check whether each pr's change_trigger contains review_comment data and, if not, fetch it again."""

        """PRTimeLine表头"""
        PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node",
                                     "comment_type", "change_trigger", "filepath"]
        """初始化目标文件"""
        target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_pr_change_trigger.tsv'

        """1. 获取该仓库所有的pr_node"""
        # repo_fullname = configPraser.getOwner() + "/" + configPraser.getRepo()
        # pr_nodes = AsyncProjectAllDataFetcher.getPullRequestNodes(repo_fullname)
        # pr_nodes = list(pr_nodes)
        # pr_nodes = [node[0] for node in pr_nodes]
        """需要获取的prs改为有issue 额 review的timeline的pr"""
        timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
        timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0)
        timeline_df = timeline_df.loc[(timeline_df['typename'] == 'IssueComment') \
                                      | (timeline_df['typename'] == 'PullRequestReview')].copy(deep=True)
        pr_nodes = list(set(timeline_df['pullrequest_node']))

        """2. 读取pr_change_trigger文件"""
        change_trigger_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
        change_trigger_df = pandasHelper.readTSVFile(fileName=change_trigger_filename, header=0)
        change_nodes = list(set(change_trigger_df['pullrequest_node']))

        # """3. 读取pr_timeline文件"""
        # timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_prtimeline.tsv'
        # timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0)

        """4. 将change_trigger按照pull_request_node分组"""
        grouped_timeline = change_trigger_df.groupby((['pullrequest_node']))
        """5. 分析pullrequest_node的change_trigger信息是否完整,整理出需要重新获取的pr信息"""
        re_analyze_prs = [x for x in pr_nodes if x not in change_nodes]
        # for pr, group in grouped_timeline:
        #     if pr not in pr_nodes:
        #         re_analyze_prs.append(pr)
        #     else:
        #         review_comment_trigger = group.loc[(group['comment_type'] == StringKeyUtils.STR_LABEL_REVIEW_COMMENT) & (group['change_trigger'] >= 0)]
        #         if review_comment_trigger is None or review_comment_trigger.empty:
        #             re_analyze_prs.append(pr)
        # Logger.logi("there are {0} prs need to re analyze".format(re_analyze_prs.__len__()))

        """读取PullRequestData,获取pr所对应的作者"""
        pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
        pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        """收集pr已经对应的作者  用于后面过滤属于作者评论"""
        pr_author_map = {}
        for index, row in pr_data_df.iterrows():
            pr_author_map[row['node_id']] = row['user_login']


        """设置fetch参数"""
        pos = 0
        fetchLimit = 200
        size = re_analyze_prs.__len__()
        while pos < size:
            Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
            sub_re_analyze_prs = re_analyze_prs[pos:pos + fetchLimit]
            """6. 重新获取这些pr的timeline"""
            re_analyze_prs_timeline_df = timeline_df[timeline_df['pullrequest_node'].isin(sub_re_analyze_prs)]
            grouped_timeline = re_analyze_prs_timeline_df.groupby((['pullrequest_node']))
            formated_data = []
            for pr, group in grouped_timeline:
                formated_data.append(group.to_dict(orient='records'))

            """7. 开始分析"""
            pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data, pr_author_map)
            pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]

            """8. 将分析结果去重并追加到change_trigger表中"""
            if pr_change_trigger_comments is not None and pr_change_trigger_comments.__len__() > 0:
                target_content = DataFrame()
                target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
                target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
                target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
                if not target_content.empty:
                    pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                              header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
            Logger.logi("successfully analyzed {0} prs".format(re_analyze_prs.__len__()))
            pos += fetchLimit
Example 39
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content


SAD = 'Sad'
HAPPY = 'Happy'

SOURCES = [(r'/Users/Rini/Documents/training_happy_sad/happy', HAPPY),
           (r'/Users/Rini/Documents/training_happy_sad/sad', SAD),
           (r'/Users/Rini/Documents/validation_happy_sad/happy', HAPPY),
           (r'/Users/Rini/Documents/validation_happy_sad/sad', SAD)]

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(numpy.random.permutation(data.index))
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['text'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

pipeline = Pipeline([('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('classifier', MultinomialNB())])

from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
Example 40
#Only keep US locations for merge and rename variables
ihme_locations = ihme_locations[(
    ihme_locations['region_name'] == str('High-income North America'))]
ihme_locations = ihme_locations.loc[:,
                                    ['location_name', 'ihme_loc_id'
                                     ]]  #Only keep variables needed for merge
ihme_locations = ihme_locations.rename(columns={'location_name': 'area_title'})

#Loop through years to produce individually cleaned datasets from BLS. Will append them all together at end
for year in range(1990, 2015):
    QCEW = QCEW.append(
        pd.read_csv(open(
            r'C:/Users/strUser/Work/Data/QCEW/{date}.annual.singlefile.csv'.
            format(date=year)),
                    usecols=[
                        "area_fips", "industry_code", "annual_avg_emplvl",
                        "own_code", "year"
                    ],
                    dtype={'area_fips': np.str}))

#Clean all datasets before merging
##Hold onto only relevant variables for GBD

QCEW = QCEW[(QCEW.own_code == 0) | (
    (QCEW.own_code == 5) & (QCEW.industry_code != '10')
)]  #Drops distinctions the QCEW study made concerning industry types
QCEW = QCEW.drop('own_code', axis=1)

#Add on human readable names
QCEW = pd.merge(QCEW, industry_titles, on='industry_code')
Example 41
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)


data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('/emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('/emails/ham', 'ham'))

#Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)
#let's test it
examples = [
    'Free Chocolates now!!!', "Hi Bob, how about a game of golf tomorrow?"
]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
print predictions
Example 42
               index=['a', 'b', 'c', 'd'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

obj = Series([1, -2, 3, -4], index=[0, 2, 3, 5])
obj2 = obj.reindex(range(6), method='ffill')
df = DataFrame(np.arange(9).reshape(3, 3),
               index=['a', 'c', 'd'],
               columns=['name', 'id', 'se'])
df2 = df.reindex(['a', 'b', 'c', 'd'])
df3 = df.reindex(columns=['name', 'year', 'id'], fill_value=0)
data2 = {'name': ['张三', '李四', '王五', '小明'], 'grade': [68, 78, 63, 92]}
df = DataFrame(data2)
df2 = df.sort_values(by='grade')

new_data = {'city': '武汉', 'name': '小李', 'sex': 'male', 'year': 2002}
df = df.append(new_data, ignore_index=True)  # ignore the index
'''
  name  grade city   sex    year
0   张三   68.0  NaN   NaN     NaN
1   李四   78.0  NaN   NaN     NaN
2   王五   63.0  NaN   NaN     NaN
3   小明   92.0  NaN   NaN     NaN
4   小李    NaN   武汉  male  2002.0
'''

# add a column
df['class'] = 2018
'''
  name  grade city   sex    year  class
0   张三   68.0  NaN   NaN     NaN   2018
1   李四   78.0  NaN   NaN     NaN   2018
Example 43
    def _forcedPhotometry(
            self, objects: pandas.DataFrame, latest_objects: pandas.DataFrame,
            dt: DateTime,
            visit_id: int) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
        """Do forced photometry on latest_objects which are not in objects.

        Extends objects catalog with new DiaObjects.

        Parameters
        ----------
        objects : `pandas.DataFrame`
            Catalog containing DiaObject records
        latest_objects : `pandas.DataFrame`
            Catalog containing DiaObject records
        dt : `DateTime`
            Visit time.
        visit_id : `int`
            Visit ID.
        """

        midPointTai = dt.get(system=DateTime.MJD)

        if objects.empty:
            return pandas.DataFrame(
                columns=["diaObjectId", "ccdVisitId", "midPointTai", "flags"
                         ]), objects

        # Ids of the detected objects
        ids = set(objects['diaObjectId'])

        # do forced photometry for all detected DiaObjects
        df1 = pandas.DataFrame({
            "diaObjectId": objects["diaObjectId"],
            "ccdVisitId": visit_id,
            "midPointTai": midPointTai,
            "flags": 0,
        })

        # do forced photometry for non-detected DiaObjects (newer than cutoff)
        o1 = cast(pandas.DataFrame,
                  latest_objects[~latest_objects["diaObjectId"].isin(ids)])

        # only do it for 30 days after last detection
        cutoff = dt.toPython() - timedelta(days=self.config.forced_cutoff_days)
        o1 = cast(pandas.DataFrame, o1[o1["lastNonForcedSource"] > cutoff])

        if o1.empty:
            catalog = df1
        else:
            df2 = pandas.DataFrame({
                "diaObjectId": o1["diaObjectId"],
                "ccdVisitId": visit_id,
                "midPointTai": midPointTai,
                "flags": 0,
            })

            # extend forced sources
            catalog = pandas.concat([df1, df2])

            # also extend objects
            o2 = pandas.DataFrame({
                "diaObjectId": o1["diaObjectId"],
                "ra": o1["ra"],
                "decl": o1["decl"],
            })
            objects = objects.append(o2)

        return catalog, objects
Example 44
    def test_other_dtypes(self, data, dtype):
        df = DataFrame(data, dtype=dtype)
        result = df.append(df.iloc[0]).iloc[-1]
        expected = Series(data, name=0, dtype=dtype)
        tm.assert_series_equal(result, expected)
Example 45
    def compute_entrainment(self, ctfind=False):
        ''' This function computes the histogram and average w for use in our computation of the w'phi' calculation

        Here we compute the histogram that will allow us to average w over the whole bin for use in the RMS calculation
        bin frequency is set by h_freq usually 2Hz which corresponds to around _______ vertical change. '''

        # These three structures will be our outputs
        # This one is for full diagnostics (Note Eflux includes only qt here)
        ent_data = DataFrame(columns=['leg', 'eflux', 'dphi', 'wprime'])
        # This one holds the ent flux for each leg
        ent_flux = {p: [] for p in self.phi_raw.keys()}
        # This one is for the average flux, used in budgeting
        avg_flux = {p: None for p in self.phi_raw.keys()}

        # hist. frequency
        dt = self.const['hist_width']
        # hist. dimensions
        # averaged v_velocity data structure
        flight_legs = {leg: None for leg in self.ein_b.keys()}
        # this structure will hold the (w', zbar) mapping for each leg of the flight. it's primary use is for visuals
        wp_mapping = {}
        # Here we'll determine the cloud-top buffer zone and qt flux jump for each of the flight legs
        # Load the
        ctop_fname = "/home/mrmisanthropy/Projects/fase/fase/flights/{}/CloudtopHeights_{}.json".format(
            self.date, self.date)
        try:
            cloud_top = read_json(ctop_fname, orient='index')
        except FileNotFoundError:
            print("Ct file not found, run ct determination?")
            raise

        # Here's a quick way to check our ct determination
        # for leg in flight_legs.keys():
        #     leg_data = self.en_cbfile[self.ein_b[leg]:self.ein_e[leg]]
        #     ct_entry = cloud_top.loc[leg, :]
        #     self.ct_check(leg_data, ct_entry, leg, save=True, show=False)

        for leg in flight_legs.keys():
            # Take a slice of the data for the leg
            leg_data = self.en_cbfile[self.ein_b[leg]:self.ein_e[leg]]

            # Create histogram of arrival times for averaging
            ts_data = leg_data['time']
            dim = {'dn': dt}
            time_series = create_histogram(dim, ts_data)
            leg_map = []
            for i in range(time_series['dim']['N']):
                ind_list = time_series['index'][i]
                events = leg_data[leg_data.index.isin(ind_list)]
                wbar = np.average(events['w'])
                zbar = np.average(events['alt'])
                w_prime_sq = np.average((events['w'] - wbar)**2)
                wprime = np.sqrt(w_prime_sq)
                leg_map.append((wprime, zbar))
            leg_map = DataFrame(leg_map, columns=["w_prime", "alt"])
            wp_mapping[leg] = leg_map
            # Now using our estimation of cloudtop, compute wprime at the altitude of the center of the buffer layer
            leg_ctop = cloud_top.loc[leg, :]
            z_top = leg_ctop['cld_bin'][1]
            wp = leg_map.iloc[(leg_map['alt'] - z_top).abs().argsort()[0]]

            # This bit computes only for qt; it's used for diagnostics
            leg_eflux = {}
            leg_eflux['leg'] = leg
            leg_eflux['eflux'] = wp['w_prime'] * leg_ctop['Dphi']['qt']
            leg_eflux['dphi'] = leg_ctop['Dphi']['qt']
            leg_eflux['wprime'] = wp['w_prime']
            ent_data = ent_data.append(Series(leg_eflux), ignore_index=True)

            # Here we append the ent flux value for each phi variable
            for ph in self.phi_raw.keys():
                ent_flux[ph].append(wp['w_prime'] * leg_ctop['Dphi'][ph])

        # Here we average each ent flux
        for ph in self.phi_raw.keys():
            avg_flux[ph] = np.average(ent_flux[ph])

        return avg_flux, ent_flux, ent_data
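A toy sketch of the per-bin w' computation performed in the loop above (the RMS of the vertical-velocity fluctuations about the bin mean), using made-up data and the same column names as the snippet ('time', 'w', 'alt'):

import numpy as np
from pandas import DataFrame

# invented leg data, for illustration only
leg_data = DataFrame({
    'time': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    'w':    [0.1, -0.2, 0.3, 0.0, 0.4, -0.1],
    'alt':  [800, 805, 810, 812, 815, 818],
})

# bin events into 0.5 s windows, then take the RMS of w about each bin's mean
bins = (leg_data['time'] // 0.5).astype(int)
leg_map = []
for _, events in leg_data.groupby(bins):
    wbar = np.average(events['w'])
    zbar = np.average(events['alt'])
    wprime = np.sqrt(np.average((events['w'] - wbar) ** 2))
    leg_map.append((wprime, zbar))
leg_map = DataFrame(leg_map, columns=['w_prime', 'alt'])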
Esempio n. 46
0
        #
        # X_important_train = sfm.transform(X_train)
        # X_important_test = sfm.transform(x_test)
        #
        # clf_important = RandomForestClassifier(n_estimators=5000, max_depth=5, min_samples_leaf=10)
        #
        # # Train the new classifier on the new dataset containing the most important features
        # clf_important.fit(X_important_train, y_train)
        predictions = model.predict(x_test)

        report = classification_report(y_test, predictions, output_dict=True)

        acc = report['accuracy']
        f1sc = report['weighted avg']['f1-score']

        # calculate and write the mean and std_dev of the accuracy & f1-score
        df_all = df_all.append(
            {
                'channel': channel,
                'segment': 'spontaneous&stimulus',
                'accuracy': acc,
                'f1-score': f1sc
            },
            ignore_index=True)

        # print('debug')
    df_all.to_csv(csv_file, mode='a', header=False)
    df_all = df_all.iloc[0:0]

write_file.close()
Esempio n. 47
0
    return df


if __name__ == "__main__":
    CMC_URL = "https://coinmarketcap.com/"
    CSV_PATH = "/CoinMarketCapData.csv"

    response = requests.get(CMC_URL + "historical/")
    soup = BeautifulSoup(response.content, "lxml")

    df = DataFrame()

    for partial_historical_link in get_historical_links(soup):
        historical_link = CMC_URL + partial_historical_link

        try:
            response = requests.get(historical_link)
            soup = BeautifulSoup(response.content, "lxml")

            new_df = parse_tables(soup)
            date = parse_date(soup)

            new_df['date'] = date
            df = df.append(new_df)

            print("Downloaded: " + date)
        except Exception:
            print("ERROR: Unable to parse " + historical_link)

        df.to_csv(CSV_PATH, index=False)
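A common alternative to growing the DataFrame with df.append inside the loop is to collect the per-page frames in a list and concatenate once at the end. A sketch reusing the snippet's own helpers (get_historical_links, parse_tables, parse_date, CMC_URL, CSV_PATH), with error handling omitted:

import pandas

frames = []
for partial_historical_link in get_historical_links(soup):
    historical_link = CMC_URL + partial_historical_link
    response = requests.get(historical_link)
    page = BeautifulSoup(response.content, "lxml")
    new_df = parse_tables(page)
    new_df['date'] = parse_date(page)
    frames.append(new_df)

df = pandas.concat(frames, ignore_index=True)
df.to_csv(CSV_PATH, index=False)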
Esempio n. 48
0
def getCorrelation(input_image, angles, dist):
    """

    Loops through a list of given angles, finding the correlation value at each angle.

    :param input_image: a single grayscale image to analyze
    :param angles: a list of angles in radians (usually from 0 to pi) to measure correlation at
    :param dist: a single distance parameter to measure correlation at
    :return: An array called "all_corr" containing the correlation values at every angle.
    """

    final_ds = DataFrame(columns=[
        'img_id', 'mean', 'std', 'kurtosis', 'skew', 'entropy', 'contrast',
        'dissimilarity', 'energy', 'ASM', 'homogeneity', 'correlation'
    ])

    img_id = 0
    image = input_image
    all_corr = []

    for angle in angles:
        glcm = greycomatrix(image=image,
                            distances=[dist],
                            angles=[angle],
                            levels=256,
                            symmetric=True,
                            normed=True)

        t = {
            'img_id': [img_id],
            'mean': [numpy.average(image)],
            'std': [numpy.std(image)],
            'kurtosis': [kurtosis(image.flatten())],
            'skew': [skew(image.flatten())],
            'entropy': [shannon_entropy(glcm, base=numpy.e)],
            'contrast': [greycoprops(glcm, 'contrast')[0, 0]],
            'dissimilarity': [greycoprops(glcm, 'dissimilarity')[0, 0]],
            'energy': [greycoprops(glcm, 'energy')[0, 0]],
            'ASM': [greycoprops(glcm, 'ASM')[0, 0]],
            'homogeneity': [greycoprops(glcm, 'homogeneity')[0, 0]],
            'correlation': [greycoprops(glcm, 'correlation')[0, 0]]
        }

        t = DataFrame(data=t,
                      columns=[
                          'img_id', 'mean', 'std', 'kurtosis', 'skew',
                          'entropy', 'contrast', 'dissimilarity', 'energy',
                          'ASM', 'homogeneity', 'correlation'
                      ])

        img_id += 1
        final_ds = final_ds.append(t)

        corr = greycoprops(glcm, 'correlation')[0, 0]
        all_corr.append(corr)

        #Printing a bunch of data from the glcm
        print(final_ds)

        # printing the angle & corresponding correlation:
        print("Angle = ", angle, "Correlation = ",
              greycoprops(glcm, 'correlation')[0, 0])

    return all_corr
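A possible usage sketch for getCorrelation, assuming a uint8 grayscale image (required by greycomatrix with levels=256); the random-noise image here is invented for illustration:

import numpy

image = numpy.random.randint(0, 256, size=(64, 64), dtype=numpy.uint8)
angles = [0.0, numpy.pi / 4, numpy.pi / 2, 3 * numpy.pi / 4]
correlations = getCorrelation(image, angles, dist=1)
print(correlations)  # one correlation value per angle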
Esempio n. 49
0
student.ix['제임스','영어'] = 98  # modify
student.loc['제임스','영어']
student

'''
student = student.set_value('제임스','영어',90)  # modify
student
'''

# append : rbind one DataFrame onto another, method 1
student_new = DataFrame([[60,80,70],[50,75,85],[90,80,85]],
                    index = ['윤건','김건모','이문세'],
                    columns = ['영어','수학','국어'])
student_new

student = student.append(student_new)  # append student_new to student (R : rbind())
student


# pd.concat : rbind one DataFrame onto another, method 2
student1 = DataFrame([[60,80,70],[50,75,85],[90,80,85]],
                     index = ['싸이','나얼','윤상'],
                     columns = ['영어','수학','국어'])
student1

student = pd.concat([student,student1])  # append student1 to student (R : rbind())
student


## add/delete rows and columns
Esempio n. 50
0
    "time":
    Series([
        3.14, 4.12, 5.44, 7.69, 10.57, 13.81, 2.49, 3.48, 4.73, 7.09, 9.95,
        13.17, 2.48, 3.42, 4.8, 6.95, 9.82, 13.06
    ],
           dtype="float32"),
    "operation":
    Series([op for op in ["INSERT", "DELETE", "LOOKUP"] for _ in range(6)],
           dtype="category")
})

all_bench_df = bst_unsafe_running_times_df.append(
    avl_unsafe_running_times_df, ignore_index=True).append(
        bst_fullextern_compilation_times_df, ignore_index=True).append(
            avl_fullextern_compilation_times_df, ignore_index=True).append(
                bst_extern_compilation_times_df, ignore_index=True).append(
                    avl_extern_compilation_times_df, ignore_index=True).append(
                        bst_intern_compilation_times_df,
                        ignore_index=True).append(
                            avl_intern_compilation_times_df, ignore_index=True)
all_bench_df = all_bench_df.astype(
    {
        "bench_name": "category",
        "bench_type": "category",
        "N": "int8",
        "size": "int32",
        "time": "float32",
        "operation": "category"
    },
    copy=True)
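An equivalent way to build the same combined frame, assuming the same eight benchmark DataFrames are in scope, is a single pandas.concat over a list; a sketch that matches the chained ignore_index appends above:

import pandas

all_bench_df = pandas.concat(
    [
        bst_unsafe_running_times_df,
        avl_unsafe_running_times_df,
        bst_fullextern_compilation_times_df,
        avl_fullextern_compilation_times_df,
        bst_extern_compilation_times_df,
        avl_extern_compilation_times_df,
        bst_intern_compilation_times_df,
        avl_intern_compilation_times_df,
    ],
    ignore_index=True,
)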
Esempio n. 51
0
forest = forest.fit(train_independent_vars, train_dependent_vars)

# Take the same decision trees and run it on the test data
output = forest.predict(train_imputed[[
    'Ticket_length', 'Title', 'NameLength', 'Pclass', 'Female', 'Age',
    'withfamily', 'Ticket_group', 'Fare', 'Embarked', 'Cabin_first_ltr',
    'spaces_in_name'
]])

### combine the passengerid with the prediction
output_df = pd.DataFrame(test_imputed.PassengerId).join(pd.DataFrame(output))
output_df.columns = ['PassengerId', 'Survived']
#### create the final output dataframe
final_output = DataFrame(columns=['PassengerId', 'Survived'])
final_output = final_output.append(output_df[['PassengerId', 'Survived']])
#### convert to csv
final_output.to_csv('output.csv',
                    index=False,
                    header=['PassengerId', 'Survived'])

# In[ ]:

#
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in indices:
Esempio n. 52
0
def runtdm(infile_):
    print "running the code for ", infile_
    tag_str = infile_.split("_13TeV")[0].split("Merged_")[1]

    cross_section_weight = xsweight(tag_str)

    print infile_, tag_str, cross_section_weight

    ### dataframe for output
    df_out = DataFrame(columns=[
        'run', 'lumi', 'event', 'MET', 'MT', 'Njets_PassID', 'Nbjets_PassID',
        'ElePt', 'EleEta', 'ElePhi', 'Jet1Pt', 'Jet1Eta', 'Jet1Phi', 'Jet2Pt',
        'Jet2Eta', 'Jet2Phi', 'Jet3Pt', 'Jet3Eta', 'Jet3Phi', 'Jet1Idx',
        'Jet2Idx', 'Jet3Idx', 'weight'
    ])
    df_out_wmunu_cr = DataFrame(columns=[
        'run', 'lumi', 'event', 'MET', 'MT', 'Njets_PassID', 'Nbjets_PassID',
        'Jet1Pt', 'Jet1Eta', 'Jet1Phi', 'Jet2Pt', 'Jet2Eta', 'Jet2Phi',
        'Jet3Pt', 'Jet3Eta', 'Jet3Phi', 'Jet1Idx', 'Jet2Idx', 'Jet3Idx',
        'MuPt', 'MuEta', 'MuPhi', 'weight'
    ])

    recoil_den = TH1F("recoil_den", "recoil_den", 100, 0.0, 1000.)
    recoil_num = TH1F("recoil_num", "recoil_num", 100, 0.0, 1000.)

    jetvariables = [
        'st_THINnJet', 'st_THINjetPx', 'st_THINjetPy', 'st_THINjetPz',
        'st_THINjetEnergy', 'st_THINjetCISVV2', 'st_THINjetHadronFlavor',
        'st_THINjetNHadEF', 'st_THINjetCHadEF', 'st_THINjetCEmEF',
        'st_THINjetPhoEF', 'st_THINjetEleEF', 'st_THINjetMuoEF',
        'st_THINjetCorrUnc', 'st_runId', 'st_lumiSection', 'st_eventId',
        'st_pfMetCorrPt', 'st_pfMetCorrPhi', 'st_pfMetUncJetResUp',
        'st_pfMetUncJetResDown', 'st_pfMetUncJetEnUp', 'st_pfMetUncJetEnDown',
        'st_isData', 'st_HLT_IsoMu24_v', 'st_HLT_IsoTkMu24_v',
        'st_HLT_IsoMu27_v', 'st_HLT_IsoTkMu27_v', 'st_HLT_Ele27_WPTight_Gsf_v',
        'st_HLT_Ele27_WPLoose_Gsf_v', 'st_HLT_Ele105_CaloIdVT_GsfTrkIdT_v',
        'st_HLT_Ele115_CaloIdVT_GsfTrkIdT_v', 'st_HLT_Ele32_WPTight_Gsf_v',
        'st_HLT_Ele32_eta2p1_WPTight_Gsf_v',
        'st_HLT_Ele27_eta2p1_WPTight_Gsf_v', 'st_THINnJet', 'st_THINjetPx',
        'st_THINjetPy', 'st_THINjetPz', 'st_THINjetEnergy', 'st_THINjetCISVV2',
        'st_THINjetHadronFlavor', 'st_THINjetNHadEF', 'st_THINjetCHadEF',
        'st_THINjetCEmEF', 'st_THINjetPhoEF', 'st_THINjetEleEF',
        'st_THINjetMuoEF', 'st_THINjetCorrUnc', 'st_nEle', 'st_elePx',
        'st_elePy', 'st_elePz', 'st_eleEnergy', 'st_eleIsPassLoose',
        'st_eleIsPassTight', 'st_nMu', 'st_muPx', 'st_muPy', 'st_muPz',
        'st_muEnergy', 'st_isTightMuon', 'st_muIso', 'st_HPSTau_n'
    ]

    filename = infile_
    ''' global variables, mainly to be stored in the new root tree for quick analysis or histo saving '''

    icount = 0

    df_new = DataFrame()

    df_all = DataFrame()

    ieve = 0

    jetptseries = []
    jetetaseries = []
    jetphiseries = []
    jet_pt30 = []
    jet_pt50 = []
    jet_eta4p5 = []
    jet_IDtightVeto = []
    jet_eta2p4 = []
    jet_NJpt30 = []
    jet_NJpt30_Eta4p5 = []
    jet_csvmedium = []
    jet_N_bmedium_eta2p4_pt30 = []

    hlt_ele = []

    met_250_ = []
    for df in read_root(filename, columns=jetvariables, chunksize=125000):
        icount = icount + 1
        ''' all the operations which should be applied to each event must be done inside this loop,
        otherwise the effect will be reflected in the last chunk only. Each chunk can be considered
        as a small rootfile. An example of how to add a new variable and copy the new dataframe into
        a bigger dataframe is shown below. This is by far the fastest method I managed to find; uproot
        awkward arrays are even faster but difficult to use on lxplus, and maybe on condor.  '''
        for nak4jet_, ak4px_, ak4py_, ak4pz_, ak4e_, ak4csv, ak4flavor, ak4NHEF, ak4CHEF, ak4CEmEF, ak4PhEF, ek4EleEF, ak4MuEF, ak4JEC, hlt_ele27, hlt_ele105, hlt_ele115, hlt_ele32, hlt_ele32_eta2p1, hlt_ele27_eta2p1, nele_, elepx_, elepy_, elepz_, elee_, elelooseid_, eletightid_, nmu_, mupx_, mupy_, mupz_, mue_, mutightid_, muIso_, met_, metphi_, run, lumi, event, nTau in zip(
                df.st_THINnJet, df.st_THINjetPx, df.st_THINjetPy,
                df.st_THINjetPz, df.st_THINjetEnergy, df.st_THINjetCISVV2,
                df.st_THINjetHadronFlavor, df.st_THINjetNHadEF,
                df.st_THINjetCHadEF, df.st_THINjetCEmEF, df.st_THINjetPhoEF,
                df.st_THINjetEleEF, df.st_THINjetMuoEF, df.st_THINjetCorrUnc,
                df.st_HLT_Ele27_WPLoose_Gsf_v,
                df.st_HLT_Ele105_CaloIdVT_GsfTrkIdT_v,
                df.st_HLT_Ele115_CaloIdVT_GsfTrkIdT_v,
                df.st_HLT_Ele32_WPTight_Gsf_v,
                df.st_HLT_Ele32_eta2p1_WPTight_Gsf_v,
                df.st_HLT_Ele27_eta2p1_WPTight_Gsf_v, df.st_nEle, df.st_elePx,
                df.st_elePy, df.st_elePz, df.st_eleEnergy,
                df.st_eleIsPassLoose, df.st_eleIsPassTight, df.st_nMu,
                df.st_muPx, df.st_muPy, df.st_muPz, df.st_muEnergy,
                df.st_isTightMuon, df.st_muIso, df.st_pfMetCorrPt,
                df.st_pfMetCorrPhi, df.st_runId, df.st_lumiSection,
                df.st_eventId, df.st_HPSTau_n):

            print "ievent = ", ieve

            ieve = ieve + 1
            if debug_: print nak4jet_, ak4px_, ak4py_, ak4pz_, ak4e_
            '''
            *******   *****   *******
               *      *          *
               *      ****       *
               *      *          *
            ***       *****      *
            '''
            ''' This small block computes the pt of the jets and adds them back
            into the original dataframe as a next step for further usage. '''
            ak4pt = [getPt(ak4px_[ij], ak4py_[ij]) for ij in range(nak4jet_)]
            jetptseries.append(ak4pt)
            ''' Jet Loose ID is already applied in the preselection  '''
            ''' eta and phi of the ak4 jets '''
            ak4eta = [
                getEta(ak4px_[ij], ak4py_[ij], ak4pz_[ij])
                for ij in range(nak4jet_)
            ]
            ak4phi = [getPhi(ak4px_[ij], ak4py_[ij]) for ij in range(nak4jet_)]

            jetetaseries.append(ak4eta)
            jetphiseries.append(ak4phi)
            ''' pT>30 GeV, |eta|<4.5 is already applied in the tuples '''
            ''' jets with pt > 30 GeV '''
            ak4_pt30 = [(ak4pt[ij] > 30.) for ij in range(nak4jet_)]
            jet_pt30.append(ak4_pt30)
            ''' jets with pt > 50 GeV '''
            ak4_pt50 = [(ak4pt[ij] > 50.) for ij in range(nak4jet_)]
            jet_pt50.append(ak4_pt50)
            ''' jet |eta| < 4.5  '''
            ak4_eta4p5 = [(abs(ak4eta[ij]) < 4.5) for ij in range(nak4jet_)]
            jet_eta4p5.append(ak4_eta4p5)
            ''' jet |eta| < 2.4 '''
            ak4_eta2p4 = [(abs(ak4eta[ij]) < 2.4) for ij in range(nak4jet_)]
            jet_eta2p4.append(ak4_eta2p4)
            ''' jet tightLeptonVeto ID to reject fake jets coming from the leptons, Veto ID should be applied only for jets within the detector abs(eta) < 2.4
            
            Following the syntax of if else in list comprehension 
            [f(x) if condition else g(x) for x in sequence]
            '''
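            # e.g. (quick illustration of the conditional list-comprehension pattern above,
            # not part of the analysis):
            #   ['small' if x < 10 else 'large' for x in [3, 42]]  ->  ['small', 'large']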

            ak4_IDtightVeto = [
                ((ak4NHEF[ij] < 0.90) and (ak4PhEF[ij] < 0.90) and
                 (ak4MuEF[ij] < 0.8) and
                 (ak4CEmEF[ij] < 0.90) and abs(ak4eta[ij]) < 2.4) or
                ((ak4NHEF[ij] < 0.90) and (ak4PhEF[ij] < 0.90) and
                 (ak4MuEF[ij] < 0.8) and abs(ak4eta[ij]) < 2.7
                 and abs(ak4eta[ij]) > 2.4) if
                (abs(ak4eta[ij]) < 2.7) else True for ij in range(nak4jet_)
            ]
            jet_IDtightVeto.append(ak4_IDtightVeto)

            if debug_: print "ak4_IDtightVeto", ak4_IDtightVeto
            ''' njets passing jet pt > 30 and eta < 4.5 and Loose Jet ID '''
            jet_NJpt30.append(ak4_pt30.count(True))

            jet_NJpt30_Eta4p5.append(
                len([
                    ij for ij in range(nak4jet_)
                    if (ak4_eta4p5[ij] and ak4_pt50[ij])
                ]))

            ak4_csvmedium = [(ak4csv[ij] > 0.8484) for ij in range(nak4jet_)]
            jet_csvmedium.append(ak4_csvmedium)

            jet_N_bmedium_eta2p4_pt30.append(
                len([
                    ij for ij in range(nak4jet_) if ((ak4_eta2p4[ij]) and (
                        ak4_pt30[ij]) and (ak4_csvmedium[ij]))
                ]))
            '''
            
            ****   *      ****
            *      *      *
            ***    *      ***
            *      *      *
            ****   ****   ****
            
            the selection for the electron is done here, later the new branches are added to the dataframe. 
            
            '''
            ''' electron triggers '''
            hlt_ele.append(
                logical_OR([
                    hlt_ele27, hlt_ele105, hlt_ele115, hlt_ele32,
                    hlt_ele32_eta2p1, hlt_ele27_eta2p1
                ]))

            if debug_: print "event ------", event
            ''' get pt, eta, phi of electrons '''
            elept = [getPt(elepx_[ie], elepy_[ie]) for ie in range(nele_)]
            eleeta = [
                getEta(elepx_[ie], elepy_[ie], elepz_[ie])
                for ie in range(nele_)
            ]
            elephi = [getPhi(elepx_[ie], elepy_[ie]) for ie in range(nele_)]
            ''' electron pt and eta cut, tuples already have electron pT > 10 GeV and |eta|<2.5
            Veto electron ID is also applied on the electron at preselection level '''
            ele_pt10 = [(elept[ie] > 10) for ie in range(nele_)]
            ele_pt30 = [(elept[ie] > 30) for ie in range(nele_)]

            ele_IDLoose = [(elelooseid_[ie]) for ie in range(nele_)]
            ele_IDTight = [(eletightid_[ie]) for ie in range(nele_)]
            ele_eta2p1 = [(abs(eleeta[ie]) < 2.1) for ie in range(nele_)]
            ele_eta2p5 = [(abs(eleeta[ie]) < 2.5) for ie in range(nele_)]

            ele_pt10_eta2p5_vetoID = []
            if len(ele_pt10) > 0:
                ele_pt10_eta2p5_vetoID = logical_AND_List2(
                    ele_pt10, ele_eta2p5)

            if debug_: print "ele info"
            if debug_: print "pt, id eta =", ele_pt30, ele_IDTight, ele_eta2p1
            if debug_:
                for ie in range(nele_):
                    print elept[ie], eleeta[ie], eletightid_[ie], elepx_[
                        ie], elepy_[ie], elepz_[ie]
            '''
            
            **     *  *     *
            * *  * *  *     *
            *  *   *  *     *
            *      *  *     *
            *      *   ***** 
            
            the selection for the muon is done here, later the new columns are added to the dataframe for each of them. 
            
            '''
            ''' muon triggers '''
            ''' muon pt threshold and eta threshold, tuples already have muon pt > 10 and |eta| < 2.4 '''
            mupt = [getPt(mupx_[imu], mupy_[imu]) for imu in range(nmu_)]
            mueta = [
                getEta(mupx_[imu], mupy_[imu], mupz_[imu])
                for imu in range(nmu_)
            ]
            muphi = [getPhi(mupx_[imu], mupy_[imu]) for imu in range(nmu_)]
            ''' For vetoing in the electron region only Loose Mu ID and Iso with pt > 10 GeV is needed and is already applied in the skimmer '''
            mu_pt10 = [(mupt[imu] > 10.0) for imu in range(nmu_)]
            mu_pt30 = [(mupt[imu] > 30.0) for imu in range(nmu_)]
            mu_eta2p4 = [(abs(mueta[imu]) < 2.4) for imu in range(nmu_)]
            mu_IDTight = [mutightid_[imu] for imu in range(nmu_)]
            mu_IsoTight = [(muIso_[imu] < 0.15) for imu in range(nmu_)]

            mu_pt10_eta2p4_looseID = []
            if len(mu_pt10):
                mu_pt10_eta2p4_looseID = logical_AND_List2(mu_pt10, mu_eta2p4)
            ''' MET SELECTION '''
            met_250_.append(met_ > 250.0)
            ''' MT Calculation for electrons '''
            mt_ele = [
                getMT(elept[ie], met_, elephi[ie], metphi_)
                for ie in range(nele_)
            ]
            #mt_ele_.append(mt_ele)
            ''' MT Calculation for muons '''
            mt_mu = [
                getMT(mupt[imu], met_, muphi[imu], metphi_)
                for imu in range(nmu_)
            ]
            #mt_mu_.append(mt_mu)
            '''
            Event selection to count the number of events.

            In simple terms, the index() method finds the given element in a list and returns its position.
            However, if the same element is present more than once, index() returns its smallest/first position.
            And this is what I generally need for this code. But it fails when there is no element, or no True, in the list.

            This looks complicated at first but is more usable. And it will be faster once I know how to flatten the dataset.

            The first element of the output returned by where is the location where True is present; it is still not known exactly how where works internally, but this is the fastest method.
            e.g.
            #if (len(ele_eta2p1)>0): ele_passlist = numpy.where(ele_eta2p1)[0]
            #print ele_passlist
            '''
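            # e.g. numpy.where([False, True, True])[0] -> array([1, 2]),
            # i.e. the positions where True appears (illustration only)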
            ''' take AND of all the electron cuts (just take the lists) '''
            ele_eta2p1_idT_pt30 = []
            if (len(ele_eta2p1) > 0):
                ele_eta2p1_idT_pt30 = logical_AND_List3(
                    ele_eta2p1, ele_IDTight, ele_pt30)
            '''
            > 0 means >= 1. The selection in the function is implemented as >=, not >, so pay attention when using this function.
            The function also takes care of the fact that the operation is performed only when the size of the list is >= N, where N defaults to 0 and has to be provided.
            '''
            pass_ele_index = WhereIsTrue(ele_eta2p1_idT_pt30, 1)

            pass_veto_id_ele_index = WhereIsTrue(ele_pt10_eta2p5_vetoID, 1)

            mu_eta2p4_idT_pt30 = []
            if (len(mu_pt30) > 0):
                mu_eta2p4_idT_pt30 = logical_AND_List3(mu_pt30, mu_IDTight,
                                                       mu_IsoTight)

            pass_mu_index = WhereIsTrue(mu_eta2p4_idT_pt30, 1)

            ak4_pt30_eta4p5_IDL = []
            if len(ak4_pt30) > 0:
                ak4_pt30_eta4p5_IDL = logical_AND_List2(ak4_pt30, ak4_eta4p5)
            ''' we need at least 3 jets passing id, so we must ensure the presence of at least 3 jets '''
            pass_jet_index = WhereIsTrue(ak4_pt30_eta4p5_IDL, 3)
            ''' 
            
            All the object selection is done before this, 
            region specific cuts are here. 
            
            '''

            jetCleanAgainstEle = []
            for ijet in range(len(ak4_pt30_eta4p5_IDL)):
                pass_ijet_iele_ = []
                for iele in range(len(ele_pt10_eta2p5_vetoID)):
                    pass_ijet_iele_.append(
                        ak4_pt30_eta4p5_IDL[ijet]
                        & ele_pt10_eta2p5_vetoID[iele]
                        & (Delta_R(ak4eta[ijet], eleeta[iele], ak4phi[ijet],
                                   elephi[iele]) > 0.4))
                print "pass_ijet_iele_ = ", pass_ijet_iele_
                ## if the number of true is equal to length of vector then it is ok to keep this jet, otherwise this is not cleaned
                jetCleanAgainstEle.append(
                    len(WhereIsTrue(pass_ijet_iele_)) == len(pass_ijet_iele_))
                print "jetCleanAgainstEle = ", jetCleanAgainstEle

            jetCleanAgainstMu = []
            for ijet in range(len(ak4_pt30_eta4p5_IDL)):
                pass_ijet_imu_ = []
                for imu in range(len(mu_pt10_eta2p4_looseID)):
                    pass_ijet_imu_.append(
                        ak4_pt30_eta4p5_IDL[ijet] & mu_pt10_eta2p4_looseID[imu]
                        & (Delta_R(ak4eta[ijet], mueta[imu], ak4phi[ijet],
                                   muphi[imu]) > 0.4))
                ## if the number of true is equal to length of vector then it is ok to keep this jet, otherwise this is not cleaned
                print "pass_ijet_imu_ = ", pass_ijet_imu_
                jetCleanAgainstMu.append(
                    len(WhereIsTrue(pass_ijet_imu_)) == len(pass_ijet_imu_))
                print "jetCleanAgainstMu = ", jetCleanAgainstMu
            jetCleaned = logical_AND_List2(jetCleanAgainstEle,
                                           jetCleanAgainstMu)
            print "jetCleaned = ", jetCleaned

            print "nele, nmu = ", ele_pt10_eta2p5_vetoID, mu_pt10_eta2p4_looseID

            pass_jet_index_cleaned = []
            pass_jet_index_cleaned = WhereIsTrue(jetCleaned, 3)

            print "pass_jet_index_cleaned = ", pass_jet_index_cleaned

            ak4_bjetM_eta2p4 = []
            if len(ak4_csvmedium) > 0:
                ak4_bjetM_eta2p4 = logical_AND_List3(ak4_csvmedium, ak4_eta2p4,
                                                     jetCleaned)
            pass_bjetM_eta2p4_index = WhereIsTrue(ak4_bjetM_eta2p4, 1)

            j1idx = -1
            j2idx = -1
            j3idx = -1

            wenu_cr = False
            if len(pass_ele_index) > 0:
                eleidx = pass_ele_index[0]
                wenu_cr = logical_AND([
                    len(ele_pt10_eta2p5_vetoID) == 1,
                    len(pass_ele_index) == 1, nmu_ == 0, met_ > 250.0,
                    len(pass_jet_index_cleaned) >= 3,
                    len(pass_bjetM_eta2p4_index) == 0,
                    mt_ele[pass_ele_index[0]] < 160., (nTau == 0)
                ])
                if len(pass_jet_index_cleaned) >= 3:
                    j1idx = pass_jet_index_cleaned[0]
                    j2idx = pass_jet_index_cleaned[1]
                    j3idx = pass_jet_index_cleaned[2]

            if wenu_cr:
                df_out = df_out.append(
                    {
                        'run': run,
                        'lumi': lumi,
                        'event': event,
                        'MET': met_,
                        'MT': mt_ele[pass_ele_index[0]],
                        'Njets_PassID': len(pass_jet_index_cleaned),
                        'Nbjets_PassID': len(pass_bjetM_eta2p4_index),
                        'ElePt': elept[eleidx],
                        'EleEta': eleeta[eleidx],
                        'ElePhi': elephi[eleidx],
                        'Jet1Pt': ak4pt[j1idx],
                        'Jet1Eta': ak4eta[j1idx],
                        'Jet1Phi': ak4phi[j1idx],
                        'Jet2Pt': ak4pt[j2idx],
                        'Jet2Eta': ak4eta[j2idx],
                        'Jet2Phi': ak4phi[j2idx],
                        'Jet3Pt': ak4pt[j3idx],
                        'Jet3Eta': ak4eta[j3idx],
                        'Jet3Phi': ak4phi[j3idx],
                        'Jet1Idx': j1idx,
                        'Jet2Idx': j2idx,
                        'Jet3Idx': j3idx,
                        'weight': cross_section_weight
                    },
                    ignore_index=True)

                if debug_:
                    print "object info", wenu_cr, run, lumi, event, eleidx, elept[
                        eleidx], eleeta[eleidx], elephi[eleidx], j1idx, ak4pt[
                            j1idx], ak4eta[j1idx], ak4phi[j1idx], j2idx, ak4pt[
                                j2idx], ak4eta[j2idx], ak4phi[
                                    j2idx], j3idx, ak4pt[j3idx], ak4eta[
                                        j3idx], ak4phi[j3idx], met_, mt_ele[
                                            pass_ele_index[0]], [
                                                len(pass_ele_index) == 1,
                                                nmu_ == 0, met_ > 250.0,
                                                len(pass_jet_index) >= 3,
                                                len(pass_bjetM_eta2p4_index) ==
                                                0, mt_ele[pass_ele_index[0]] <
                                                160.
                                            ]
            ''' W mu nu CR '''
            j1idx = -1
            j2idx = -1
            j3idx = -1

            wmunu_cr = False
            if len(pass_mu_index) > 0:
                muidx = pass_mu_index[0]
                wmunu_cr = logical_AND([
                    len(mu_pt10_eta2p4_looseID) == 1,
                    len(pass_mu_index) == 1, nele_ == 0, met_ > 250.0,
                    len(pass_jet_index_cleaned) >= 3,
                    len(pass_bjetM_eta2p4_index) == 0, mt_mu[muidx] < 160.,
                    (nTau == 0)
                ])
                if len(pass_jet_index_cleaned) >= 3:
                    j1idx = pass_jet_index_cleaned[0]
                    j2idx = pass_jet_index_cleaned[1]
                    j3idx = pass_jet_index_cleaned[2]

                print "object info mu ", wmunu_cr, run, lumi, event, mupt[
                    muidx], mueta[muidx], muphi[muidx], j1idx, ak4pt[
                        j1idx], ak4eta[j1idx], ak4phi[j1idx], j2idx, ak4pt[
                            j2idx], ak4eta[j2idx], ak4phi[j2idx], j3idx, ak4pt[
                                j3idx], ak4eta[j3idx], ak4phi[
                                    j3idx], met_, mt_mu[pass_mu_index[0]], [
                                        len(pass_mu_index) == 1, nmu_ == 1,
                                        met_ > 250.0,
                                        len(pass_jet_index) >= 3,
                                        len(pass_bjetM_eta2p4_index) == 0,
                                        mt_mu[pass_mu_index[0]] < 160.
                                    ]
                if wmunu_cr:
                    df_out_wmunu_cr = df_out_wmunu_cr.append(
                        {
                            'run': run,
                            'lumi': lumi,
                            'event': event,
                            'MET': met_,
                            'MT': mt_mu[muidx],
                            'Njets_PassID': len(pass_jet_index_cleaned),
                            'Nbjets_PassID': len(pass_bjetM_eta2p4_index),
                            'Jet1Pt': ak4pt[j1idx],
                            'Jet1Eta': ak4eta[j1idx],
                            'Jet1Phi': ak4phi[j1idx],
                            'Jet2Pt': ak4pt[j2idx],
                            'Jet2Eta': ak4eta[j2idx],
                            'Jet2Phi': ak4phi[j2idx],
                            'Jet3Pt': ak4pt[j3idx],
                            'Jet3Eta': ak4eta[j3idx],
                            'Jet3Phi': ak4phi[j3idx],
                            'Jet1Idx': j1idx,
                            'Jet2Idx': j2idx,
                            'Jet3Idx': j3idx,
                            'MuPt': mupt[muidx],
                            'MuEta': mueta[muidx],
                            'MuPhi': muphi[muidx],
                            'weight': cross_section_weight
                        },
                        ignore_index=True)

        df_all = concat([df_all, df])

    if debug_: print df_out

    outputfilename = args.outputfile
    df_out.to_root(outputfilename, key='t_dm_wenucr')

    df_out_wmunu_cr.to_root(outputfilename, key='t_dm_wmunucr', mode='a')

    end = time.clock()
    print "%.4gs" % (end - start)
Esempio n. 53
0
precisiondf = DataFrame()
f1df = DataFrame()
recalldf = DataFrame()
accuracydf = DataFrame()

for i in range(100):
    np.random.seed(i)
    random_label = np.random.randint(0,4,size=len(test_set_true_label))
    precision = precision_score(test_set_true_label, random_label,labels=[0,1,2,3], average=None)
    recall = recall_score(test_set_true_label, random_label, labels=[0,1,2,3],average=None)
    f1 = f1_score(test_set_true_label, random_label, labels=[0,1,2,3],average=None)
    random_confusion_matrix = confusion_matrix(test_set_true_label, random_label, labels=[0, 1, 2, 3])
    random_confusion_matrix = random_confusion_matrix.astype('float') / random_confusion_matrix.sum(axis=1)[:, np.newaxis]
    accuracy = random_confusion_matrix.diagonal()
    precisiondf = precisiondf.append(pd.Series(precision), ignore_index=True)
    recalldf = recalldf.append(pd.Series(recall), ignore_index=True)
    f1df = f1df.append(pd.Series(f1), ignore_index=True)
    accuracydf = accuracydf.append(pd.Series(accuracy), ignore_index=True)

print("Accuracy Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(accuracydf[0])), mean(list(accuracydf[1])), mean(list(accuracydf[2])), mean(list(accuracydf[3])) ))
print("Precision Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(precisiondf[0])), mean(list(precisiondf[1])), mean(list(precisiondf[2])), mean(list(precisiondf[3])) ))
print("Recall Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(recalldf[0])), mean(list(recalldf[1])), mean(list(recalldf[2])), mean(list(recalldf[3])) ))
print("F1 Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(f1df[0])), mean(list(f1df[1])), mean(list(f1df[2])), mean(list(f1df[3])) ))
print()
print("Accuracy macro average: {}".format((mean(list(accuracydf[0]))+ mean(list(accuracydf[1]))+ mean(list(accuracydf[2]))+mean(list(accuracydf[3])))/4))
print("Precision macro average: {}".format((mean(list(precisiondf[0]))+ mean(list(precisiondf[1]))+ mean(list(precisiondf[2]))+mean(list(precisiondf[3])))/4))
print("Recall macro average: {}".format((mean(list(recalldf[0]))+ mean(list(recalldf[1]))+ mean(list(recalldf[2]))+mean(list(recalldf[3])))/4))
print("F1 macro average: {}".format((mean(list(f1df[0]))+ mean(list(f1df[1]))+ mean(list(f1df[2]))+mean(list(f1df[3])))/4))
test_frequency_percentage = test_set.groupby(test_set['Label']).size()/len(test_set)*100
print('Test Set Relative Label Frequency (%)')
Esempio n. 54
0
    # Once the data is fully cleaned, sort each store's row in descending order and take only the top 6 related entries (meaning we recommend 6)
    second = first[index].sort_values(ascending=False).head(6)
    sql = "select email from tripmall_db.user where id=" + str((index + 1))
    cursor.execute(sql)
    result = cursor.fetchall()
    selected = result[0].get('email')

    for index2 in range(0, first[0].size):
        # for the if statement: print things out and the keys() and values parts will make sense

        sql = "select email from tripmall_db.user where id=" + str(
            (second.keys()[index2] + 1))
        cursor.execute(sql)
        result = cursor.fetchall()
        recommended = result[0].get('email')
        if selected != recommended:
            new_row = {
                'selected_user': selected,
                'recommended_user': recommended,
                'similarity': second.values[index2]
            }
            recommendAuto = recommendAuto.append(new_row, ignore_index=True)
# print(recommendAuto)
engine = create_engine("mysql+mysqldb://root:" + "adminssafy" +
                       "@localhost:3306/tripmall_db",
                       encoding='utf-8')
conn = engine.connect()
recommendAuto.to_sql(name='usersimilaritybased',
                     con=engine,
                     if_exists='append')
print("finish")
Esempio n. 55
0
    "title", "url", "points", "num_comments", "author", "created_at_i",
    "objectID"
]

i = 0

while True:
    try:
        url = 'https://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=%s&numericFilters=created_at_i<%s' % (
            hitsPerPage, ts)
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        data = json.loads(response.read())
        last = data["nbHits"] < hitsPerPage
        data = DataFrame(data["hits"])[requested_keys]
        df = df.append(data, ignore_index=True)
        ts = data.created_at_i.min()
        print i
        if (last):
            break
        time.sleep(3.6)
        i += 1

    except Exception, e:
        print e

df["title"] = df["title"].map(lambda x: x.translate(
    dict.fromkeys([
        0x201c, 0x201d, 0x2011, 0x2013, 0x2014, 0x2018, 0x2019, 0x2026, 0x2032
    ])).encode('utf-8').replace(',', ''))
df["created_at"] = df["created_at_i"].map(
Esempio n. 56
0
def main():
    """
    Main function of the program. This function makes calls to other functions.
    :return: None
    """

    files = [
        'data/enron_with_categories/1',
        'data/enron_with_categories/2',
        'data/enron_with_categories/3',
        'data/enron_with_categories/4',
        'data/enron_with_categories/5',
        'data/enron_with_categories/6',
        'data/enron_with_categories/7',
        'data/enron_with_categories/8',
    ]

    data = DataFrame({'text': [], 'class': []})

    # create data frame
    for path in files:
        data = data.append(build_data_frame(path))

    # The classes as mentioned in the categories.txt file,
    # we ignore the third parameter in the cats file
    classes = ("1,1", "1,2", "1,3", "1,4", "1,5", "1,6", "1,7", "1,8", "2,1",
               "2,2", "2,3", "2,4", "2,5", "2,6", "2,7", "2,8", "2,9", "2,10",
               "2,11", "2,12", "2,13", "3,1", "3,2", "3,3", "3,4", "3,5",
               "3,6", "3,7", "3,8", "3,9", "3,10", "3,11", "3,12", "3,13",
               "4,1", "4,2", "4,3", "4,4", "4,5", "4,6", "4,7", "4,8", "4,9",
               "4,10", "4,11", "4,12", "4,13", "4,14", "4,15", "4,16", "4,17",
               "4,18", "4,19")

    # Create pipelines for each combination of text_extraction and classifier

    pipeline = Pipeline([('text_extraction', CountVectorizer(ngram_range=(2,
                                                                          2))),
                         ('classifier', OneVsRestClassifier(MultinomialNB()))])
    print("text_extraction: ", "CountVectorizer", "classifier:",
          "MultinomialNB")
    classify(pipeline, classes, data)

    pipeline = Pipeline([('text_extraction', CountVectorizer(ngram_range=(1,
                                                                          2))),
                         ('classifier', OneVsRestClassifier(LinearSVC()))])
    print("text_extraction: ", "CountVectorizer", "classifier:", "LinearSVC")
    classify(pipeline, classes, data)

    pipeline = Pipeline([
        ('text_extraction', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', OneVsRestClassifier(KNeighborsClassifier()))
    ])
    print("text_extraction: ", "CountVectorizer", "classifier:",
          "KNeighborsClassifier")
    classify(pipeline, classes, data)

    pipeline = Pipeline([('text_extraction', TfidfVectorizer(ngram_range=(1,
                                                                          2))),
                         ('classifier', OneVsRestClassifier(MultinomialNB()))])
    print("text_extraction: ", "TfidfVectorizer", "classifier:",
          "MultinomialNB")
    classify(pipeline, classes, data)

    pipeline = Pipeline([('text_extraction', TfidfVectorizer(ngram_range=(1,
                                                                          2))),
                         ('classifier', OneVsRestClassifier(LinearSVC()))])
    print("text_extraction: ", "TfidfVectorizer", "classifier:", "LinearSVC")
    classify(pipeline, classes, data)

    pipeline = Pipeline([
        ('text_extraction', TfidfVectorizer(ngram_range=(1, 2))),
        ('classifier', OneVsRestClassifier(KNeighborsClassifier()))
    ])
    print("text_extraction: ", "TfidfVectorizer", "classifier:",
          "KNeighborsClassifier")
    classify(pipeline, classes, data)
Esempio n. 57
0
    def test_append_series_dict(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        series = df.loc[4]
        with tm.assert_raises_regex(ValueError,
                                    'Indexes have overlapping values'):
            df.append(series, verify_integrity=True)
        series.name = None
        with tm.assert_raises_regex(
                TypeError, 'Can only append a Series if '
                'ignore_index=True'):
            df.append(series, verify_integrity=True)

        result = df.append(series[::-1], ignore_index=True)
        expected = df.append(DataFrame({
            0: series[::-1]
        }, index=df.columns).T,
                             ignore_index=True)
        assert_frame_equal(result, expected)

        # dict
        result = df.append(series.to_dict(), ignore_index=True)
        assert_frame_equal(result, expected)

        result = df.append(series[::-1][:3], ignore_index=True)
        expected = df.append(DataFrame({
            0: series[::-1][:3]
        }).T,
                             ignore_index=True,
                             sort=True)
        assert_frame_equal(result, expected.loc[:, result.columns])

        # can append when name set
        row = df.loc[4]
        row.name = 5
        result = df.append(row)
        expected = df.append(df[-1:], ignore_index=True)
        assert_frame_equal(result, expected)
Esempio n. 58
0
set_index() — set an existing column as the index
DataFrame.set_index(keys,
                    drop = True,
                    append = False,
                    inplace = False)
append=True keeps the original index and adds the new one
drop=False keeps the column that was used as the index
inplace=True modifies the original dataset in place

reset_index restores the index, turning it back into the default integer index
df.reset_index(level = None, drop = False, inplace = False, col_level = 0, col_fill='')

Combining two DataFrames with the same structure
pandas.concat([dataFrame1, dataFrame2,...])
df.append(df2, ignore_index = True) # append df2 onto df
pandas.concat([df1, df2], ignore_index = True) # ignore_index means the index simply continues

Combining different columns within the same DataFrame
X = x1 + x2 +...

Matching and merging DataFrames with different structures according to a condition
Table 1: name, student ID. Table 2: student ID, advisor
merge(x, y, left_on, right_on) # the last two parameters are the matching columns in the two tables

Min-max scaling (deviation standardization)
X* = (x - min)/(max - min)

Z-score standardization
X* = (x - μ)/σ
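A short runnable sketch illustrating the operations summarized above (the tables and column names are invented for illustration):

import pandas as pd

students = pd.DataFrame({'name': ['Ann', 'Bob'], 'sid': [1, 2], 'score': [80, 90]})

# set_index / reset_index
indexed = students.set_index('sid', drop=True)  # 'sid' becomes the index
restored = indexed.reset_index()                # back to the default integer index

# rbind-style combination of identically structured frames
more = pd.DataFrame({'name': ['Cat'], 'sid': [3], 'score': [70]})
combined = pd.concat([students, more], ignore_index=True)  # same effect as students.append(more, ignore_index=True)

# match-and-merge two differently structured tables on a shared column
advisors = pd.DataFrame({'sid': [1, 2, 3], 'advisor': ['Kim', 'Lee', 'Park']})
merged = pd.merge(combined, advisors, left_on='sid', right_on='sid')

# min-max scaling and Z-score standardization of a column
x = merged['score']
merged['score_minmax'] = (x - x.min()) / (x.max() - x.min())
merged['score_z'] = (x - x.mean()) / x.std()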
Esempio n. 59
0

def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)


data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory(
    'C:/Users/Barath Tirumala/Desktop/DSC/DataScience/DataScience-Python3/emails/spam',
    'spam'),
                   sort=True)
data = data.append(dataFrameFromDirectory(
    'C:/Users/Barath Tirumala/Desktop/DSC/DataScience/DataScience-Python3/emails/ham',
    'ham'),
                   sort=True)

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
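A minimal continuation sketch (not part of the original snippet) showing how the fitted classifier could be applied to the vectorized example messages:

predictions = classifier.predict(example_counts)
print(predictions)  # an array of 'spam'/'ham' labels, one per example message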
Esempio n. 60
0
    def getPRChangeTriggerData(owner, repo):
        """ 根据
            ALL_{repo}_data_prtimeline.tsv
            获取pr change_trigger数据
        """
        AsyncApiHelper.setRepo(owner, repo)
        """PRTimeLine表头"""
        PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node",
                                     "comment_type", "change_trigger", "filepath"]
        """初始化目标文件"""
        target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
        target_content = DataFrame(columns=PR_CHANGE_TRIGGER_COLUMNS)
        # pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
        #                           header=pandasHelper.INT_WRITE_WITH_HEADER)

        """读取PRTimeline,获取需要分析change_trigger的pr列表"""
        pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
        pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                                  header=pandasHelper.INT_READ_FILE_WITH_HEAD)

        """读取PullRequestData,获取pr所对应的作者"""
        pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
        pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD)
        """收集pr已经对应的作者  用于后面过滤属于作者评论"""
        pr_author_map = {}
        for index, row in pr_data_df.iterrows():
            pr_author_map[row['node_id']] = row['user_login']

        pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
        pr_nodes.sort()
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjE5MjEzOTc5']  # reopened 3 times
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjA0MTk5ODkw']
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDQwOTAxMzk0']
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MzE1OTU0NDgw']  # review outside the pr
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTQ3NDczNTIx']  # normal case
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDM4NjAzMjk2']  # a huge number of reviews
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0Mjg1NzExNTIx']
        # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTAxNTUwMTcw']

        """设置fetch参数"""
        pos = 0
        fetchLimit = 400
        size = pr_nodes.__len__()
        Logger.logi("there are {0} prs need to analyze".format(pr_nodes.__len__()))
        t1 = datetime.now()

        while pos < size:
            print("now:", pos, ' total:', size, 'cost time:', datetime.now() - t1)
            Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))

            """按照爬取限制取子集"""
            sub_prs = pr_nodes[pos:pos + fetchLimit]
            pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'].isin(sub_prs)]
            """对子集按照pull_request_node分组"""
            grouped_timeline = pr_timeline_items.groupby((['pullrequest_node']))
            """将分组结果保存为字典{pr->pr_timeline_items}"""
            formated_data = []
            for pr, group in grouped_timeline:
                record = group.to_dict(orient='records')
                record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
                formated_data.append(record)

            """分析这些pr的timeline"""
            pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data,
                                                                                             pr_author_map)
            pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]

            """将分析结果去重并追加到change_trigger表中"""
            if pr_change_trigger_comments.__len__() > 0:
                target_content = DataFrame()
                target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
                target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
                target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
                if not target_content.empty:
                    pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                              header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
                Logger.logi("successfully analyzed {0} prs".format(pos))
            pos += fetchLimit