Code Example #1
def getSJSuggestion(jobPoolLocal):
    global userDF, appDF, jobDF, jobPool, liveJobDF, liveJobDict, LiveJobs, JobUserSparseMatrix, LiveJobUserSparseMatrix
    jobPool = jobPoolLocal
    st = datetime.now()
    userDF = pd.DataFrame({'userId': appDF['userId'].unique()})
    appDF['userLookUp'] = pd.match(appDF['userId'], userDF['userId'])
    appDF['jobLookUp'] = pd.match(appDF['jobId'], jobDF['jobId'])
    appDF['liveJobLookUp'] = pd.match(appDF['jobId'], liveJobDF['jobId'])
    row = appDF['jobLookUp']
    col = appDF['userLookUp']
    data = np.repeat(1,appDF.shape[0])
    JobUserSparseMatrix = sp.coo_matrix((np.array(data), (np.array(row),np.array(col))), shape=(jobDF.shape[0], userDF.shape[0])) # 30L x 5L
    del row, col, data
    JobUserSparseMatrix = JobUserSparseMatrix.tocsr()
    row = appDF[~(appDF.liveJobLookUp == -1)]['liveJobLookUp']
    col = appDF[~(appDF.liveJobLookUp == -1)]['userLookUp']
    data = np.repeat(1,appDF[~(appDF.liveJobLookUp == -1)].shape[0])
    LiveJobUserSparseMatrix = sp.coo_matrix((np.array(data), (np.array(row),np.array(col))), shape=(liveJobDF.shape[0], userDF.shape[0])) # 30L x 1L
    del row, col, data
    LiveJobUserSparseMatrix = LiveJobUserSparseMatrix.tocsr()
    liveJobDict = dict(zip(liveJobDF.index, liveJobDF['jobId']))
    collectionJS.drop()
    collectionJSExport.drop()
    print 'JobSuggestions Count:', collectionJS.count(), 'JobSuggestionsExport Count:', collectionJSExport.count() 
    st1 = datetime.now()
    for jobPosition in range(0, JobUserSparseMatrix.shape[0], jobPool):
        getSJSuggestionPoolWise(jobPosition)
    print datetime.now() - st1
    print 'JobSuggestions Count:', collectionJS.count(), 'JobSuggestionsExport Count:', collectionJSExport.count()
    print "Run time :" + str(datetime.now() - st)  # 10 min
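
The function above leans on pd.match to turn user and job IDs into integer row/column positions before assembling scipy COO matrices. A minimal, self-contained sketch of that lookup-then-sparse-matrix pattern, with pd.Index.get_indexer standing in for the deprecated pd.match and a toy appDF invented purely for illustration:

# Sketch of the lookup-then-COO pattern used above (toy data, illustrative names).
import numpy as np
import pandas as pd
import scipy.sparse as sp

appDF = pd.DataFrame({'userId': ['u1', 'u2', 'u1'], 'jobId': ['j1', 'j1', 'j2']})
userDF = pd.DataFrame({'userId': appDF['userId'].unique()})
jobDF = pd.DataFrame({'jobId': appDF['jobId'].unique()})

row = pd.Index(jobDF['jobId']).get_indexer(appDF['jobId'])     # job id -> row position
col = pd.Index(userDF['userId']).get_indexer(appDF['userId'])  # user id -> column position
data = np.repeat(1, appDF.shape[0])

JobUserSparseMatrix = sp.coo_matrix((data, (row, col)),
                                    shape=(jobDF.shape[0], userDF.shape[0])).tocsr()
print(JobUserSparseMatrix.toarray())  # [[1 1], [1 0]]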
Code Example #2
File: entropy.py Project: tayoshan/IpythonNotebooks
def balanceFactors(data, sep, cost, factors, constraints, model):
    its = 0
    cnvg = 1
    while cnvg > .0001:
        its = its + 1
        if model != 'attConstrained':
            calcAi(data, sep, cost, factors, model)
            AiBF = (data.groupby(data[constraints['production']].name).aggregate({"Ai": np.sum}))
            AiBF["Ai"] = 1/AiBF["Ai"]
            updates = AiBF.ix[pd.match(data[constraints['production']], AiBF.index), "Ai"]
            data["Ai"] = updates.reset_index(level=0, drop=True) if(updates.notnull().any()) else data["Ai"]
            if model == 'prodConstrained':
                break
            if its == 1:
                data["OldAi"] = data["Ai"]
            else:
                data["diff"] = abs((data["OldAi"] - data["Ai"])/data["OldAi"])
                data["OldAi"] = data["Ai"]

        if model != 'prodConstrained':
            calcBj(data, sep, cost, factors, model)
            BjBF = data.groupby(data[constraints['attraction']].name).aggregate({"Bj": np.sum})
            BjBF["Bj"] = 1/BjBF["Bj"]
            updates = BjBF.ix[pd.match(data[constraints['attraction']], BjBF.index), "Bj"]
            data["Bj"] = updates.reset_index(level=0, drop=True) if(updates.notnull().any()) else data["Bj"]
            if its == 1:
                if model == 'attConstrained':
                    break
                data["OldBj"] = data["Bj"]
            else:
                data["diff"] = abs((data["OldBj"] - data["Bj"])/data["OldBj"])
                data["OldBj"] = data["Bj"]
        cnvg = np.sum(data["diff"])
        #print cnvg, its
    return data
Code Example #3
    def _load_data(self, x1, x2):
        final_data = np.zeros((x1.num_row, x2.num_col))
        final_row_labels = x1.row_labels
        final_col_labels = x2.col_labels

        prior_data = np.genfromtxt(self.data_file)
        prior_row_labels = self.prior_row_labels.tolist()
        assert prior_data.shape[0] == len(prior_row_labels)
        prior_col_labels = self.prior_col_labels.tolist()
        assert prior_data.shape[1] == len(prior_col_labels)

        prior_row_match_ind = pd.match(prior_row_labels, final_row_labels)
        prior_rows_to_transfer = [
            el for el in range(len(prior_row_labels))
            if prior_row_match_ind[el] != -1
        ]
        final_rows_to_fill = prior_row_match_ind[prior_row_match_ind != -1]
        prior_col_match_ind = pd.match(prior_col_labels, final_col_labels)
        prior_cols_to_transfer = [
            el for el in range(len(prior_col_labels))
            if prior_col_match_ind[el] != -1
        ]
        final_cols_to_fill = prior_col_match_ind[prior_col_match_ind != -1]
        final_data[np.ix_(final_rows_to_fill, final_cols_to_fill)] = \
            prior_data[prior_rows_to_transfer,:][:,prior_cols_to_transfer]

        self.row_labels = final_row_labels
        self.col_labels = final_col_labels

        self.data = final_data
Code Example #4
File: data.py Project: nickmcadden/Kaggle
def category_transformation(train_categoric,
                            test_categoric,
                            labels,
                            type='std'):

    if type == 'freq':
        print("Encoding categories by frequency rank...")
        for c in train_categoric.columns:
            freqs = train_categoric[c].append(test_categoric[c]).value_counts()
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          freqs[0:91].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         freqs[0:91].index)

    if type == 'std':
        print("Encoding categories by sklearn label encoder...")
        for c in train_categoric.columns:
            lbl = LabelEncoder()
            lbl.fit(
                list(train_categoric.ix[:, c]) + list(test_categoric.ix[:, c]))
            train_categoric.ix[:, c] = lbl.transform(train_categoric.ix[:, c])
            test_categoric.ix[:, c] = lbl.transform(test_categoric.ix[:, c])

    if type == 'tgtrate':
        print("Encoding categories by target rate...")
        for c in train_categoric.columns:
            train_categoric[c], test_categoric[c] = category_to_prob_weight(
                train_categoric, test_categoric, c, labels)

    if type == 'rank':
        print("Encoding categories by rank transformation...")
        for c in train_categoric.columns:
            rank = pd.concat([train_categoric[c], labels],
                             axis=1).groupby(c).mean().sort_values(
                                 by='target', ascending=False)
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          rank[0:20000].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         rank[0:20000].index)

    if type == 'onehot':
        print("One hot... ")
        for c in train_categoric.columns:
            uniques = np.unique(train_categoric[c])
            if len(uniques) > 100:
                train_categoric.drop(c, axis=1, inplace=True)
                test_categoric.drop(c, axis=1, inplace=True)
        x_cat_train = train_categoric.T.to_dict().values()
        x_cat_test = test_categoric.T.to_dict().values()

        # vectorize
        vectorizer = DV(sparse=False)
        train_categoric = pd.DataFrame(vectorizer.fit_transform(x_cat_train))
        test_categoric = pd.DataFrame(vectorizer.transform(x_cat_test))

    return train_categoric, test_categoric
Code Example #5
def balanceFactors(data, sep, cost, factors, constraints, model):
    """
    calculate balancing factors and balance the balancing factors if doubly constrained model
    """
    its = 0
    cnvg = 1
    while cnvg > .0001:
        its = its + 1
        #If model is prod or doubly constrained
        if model != 'attConstrained':
            calcAi(data, sep, cost, factors, model)
            AiBF = (data.groupby(
                data[constraints['production']].name).aggregate({"Ai":
                                                                 np.sum}))
            AiBF["Ai"] = 1 / AiBF["Ai"]
            updates = AiBF.ix[
                pd.match(data[constraints['production']], AiBF.index), "Ai"]
            data["Ai"] = updates.reset_index(level=0, drop=True) if (
                updates.notnull().any()) else data["Ai"]
            #If model is prod constrained stop here - dont need to balance
            if model == 'prodConstrained':
                break
            if its == 1:
                data["OldAi"] = data["Ai"]
            else:
                data["diff"] = abs(
                    (data["OldAi"] - data["Ai"]) / data["OldAi"])
                data["OldAi"] = data["Ai"]
        #If model is att or doubly constrained
        if model != 'prodConstrained':
            calcBj(data, sep, cost, factors, model)
            BjBF = data.groupby(
                data[constraints['attraction']].name).aggregate({"Bj": np.sum})
            BjBF["Bj"] = 1 / BjBF["Bj"]
            updates = BjBF.ix[
                pd.match(data[constraints['attraction']], BjBF.index), "Bj"]
            data["Bj"] = updates.reset_index(level=0, drop=True) if (
                updates.notnull().any()) else data["Bj"]
            if its == 1:
                #If model is att constrained stop here - dont need to balance
                if model == 'attConstrained':
                    break
                data["OldBj"] = data["Bj"]
            else:
                data["diff"] = abs(
                    (data["OldBj"] - data["Bj"]) / data["OldBj"])
                data["OldBj"] = data["Bj"]
        cnvg = np.sum(data["diff"])
        #print cnvg, its
    return data
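
For reference, the loop above is the usual iterative fitting of doubly constrained balancing factors: each pass recomputes A_i = 1 / sum_j(B_j * D_j * f(c_ij)) from the current B_j, then B_j = 1 / sum_i(A_i * O_i * f(c_ij)) from the new A_i, with the per-zone sums carried out by the groupby-aggregate and broadcast back to row level via pd.match. Iteration stops once the summed relative change in the factors ("diff") falls below 0.0001. This is the conventional formulation; the exact deterrence term f(c_ij) is whatever calcAi/calcBj build from sep, cost and factors, which are not shown in this snippet.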
Code Example #6
File: xgb_kernel.py Project: nickmcadden/Kaggle
def category_transformation(train_categoric, test_categoric, labels, type='std'):

	if type == 'freq':
		print("Encoding categories by freqency rank...")
		for c in train_categoric.columns:
			freqs = train_categoric[c].append(test_categoric[c]).value_counts()
			train_categoric[c] = pd.match(train_categoric[c].values, freqs[0:1000].index)
			test_categoric[c] = pd.match(test_categoric[c].values, freqs[0:1000].index)

	if type == 'tgtrate':
		print("Encoding categories by target rate...")
		for c in train_categoric.columns:
			train_categoric[c], test_categoric[c] = category_to_prob_weight(train_categoric, test_categoric, c, labels)

	return train_categoric, test_categoric
Code Example #7
File: views.py Project: AlexRuizE/YelpReviewRank
def get_highest_reviews(bid):
    other_reviewers_of_restaurant=list(yelp['user_id'][yelp['business_id']==bid])
    uid=random.sample(other_reviewers_of_restaurant, 1)
    user_column=pandas.match(uid, users)[0]
    similarity_indices_for_user=list(df.ix[:,user_column])
    z=numpy.array(similarity_indices_for_user)
    most_similar_users=numpy.argsort(z)[0:10]
    most_similar_users=[users[most_similar_users[i]] for i in range(10)]
    name_rest=yelp['name.business'][yelp['business_id']==bid].unique()[0]
    f =  lambda row: row['user_id'] in most_similar_users and row['name.business'] in name_rest
    k = yelp.apply(f, axis=1)
    temp=yelp[k]
    temp.iloc[:,[1,3,6,9,11,17,21,27]]
    x1=list(temp['stars.review']); x2=list(temp['richness']); x3=list(temp['fans'])
    x4=list(temp['review_count.review']); x5=list(temp['stars.business'])
    predicted_values=list()
    for i in range(len(temp)):
        predicted_values.append([predict_expected_value(x1[i],x2[i],x3[i],x4[i],x5[i]), i])
    predicted_values.sort(key=lambda x: x[0])
    predicted_values=predicted_values[-3:]
    predicted_values.sort(key=lambda x: x[1])
    reviews=list()
    for value in range(len(predicted_values)):
        row_of_review=predicted_values[value][1]
        print(row_of_review)
        reviews.append(temp['text'].iloc[row_of_review])
    return reviews
Code Example #8
File: gravity.py Project: shepherdmeng/pysal
def total_flows(dt, f, locs):
    """
    sum rows or columns to derive total inflows or total outflows
    """

    totals = dt.groupby(locs).aggregate({f: np.sum})
    return totals.ix[pd.match(locs, totals.index.astype(str))].reset_index()[f]
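
The groupby-aggregate followed by the pd.match re-alignment above broadcasts each location's total back onto every row of the flow table. In non-deprecated pandas the same result comes from a groupby transform; a minimal sketch with invented column names (the real dt, f and locs are whatever the caller passes in):

import pandas as pd

# Toy flow table; 'origin' plays the role of locs and 'flows' the role of f.
dt = pd.DataFrame({'origin': ['a', 'a', 'b'], 'flows': [10, 5, 7]})

# One total per row, aligned with dt: [15, 15, 7]
totals = dt.groupby('origin')['flows'].transform('sum')
print(totals.tolist())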
Code Example #9
def explain_prediction(bst, explainer, data):
    """

    :param bst:
    :type bst: xgb.Booster
    :param explainer:
    :type explainer: pd.DataFrame
    :param data:
    :return:
    """
    nodes = bst.predict(data, pred_leaf=True)
    colnames = list(explainer.columns.values)[:-2]

    preds_breakdown = pd.DataFrame(np.zeros((nodes.shape[0], len(colnames))),
                                   columns=colnames)

    print("Extracting the breakdown of each prediction...")
    num_trees = nodes.shape[1]
    with click.progressbar(range(num_trees), num_trees) as bar:
        for idx in bar:
            nodes_for_tree = nodes[:, idx]
            tree_breakdown = explainer[explainer["tree"] == idx].fillna(0)
            preds_breakdown_for_tree = tree_breakdown.loc[
                pd.match(nodes_for_tree, tree_breakdown["leaf"])][colnames] \
                .reset_index(drop=True)
            preds_breakdown = preds_breakdown + preds_breakdown_for_tree
    print("DONE!")
    return preds_breakdown
Code Example #10
File: filter_outliers.py Project: sminot/deenurp
def parse_usearch_allpairs(filename, seqnames):
    """Read output of ``usearch -allpairs_global -blast6out`` and return a
    square distance matrix. ``seqnames`` determines the marginal order
    of sequences in the matrix.

    """

    data = pd.read_table(filename, header=None, names=BLAST6NAMES)
    data['dist'] = pd.Series(
        1.0 - data['pct_id'] / 100.0, index=data.index)

    # for each sequence pair, select the longest alignment if there is
    # more than one (chooses first occurrence if there are two the same
    # length).
    maxidx = data.groupby(['query', 'target']).apply(
        lambda x: x['align_len'].idxmax())
    data = data.iloc[maxidx]

    if set(seqnames) != set(data['query']) | set(data['target']):
        # shutil.copy(filename, '.')
        raise UsearchError(
            'some sequences are missing from the output ({})'.format(filename))

    nseqs = len(seqnames)
    distmat = numpy.repeat(0.0, nseqs ** 2)
    distmat.shape = (nseqs, nseqs)
    ii = pd.match(data['query'], seqnames)
    jj = pd.match(data['target'], seqnames)

    # usearch_allpairs_files returns comparisons corresponding to a
    # triangular matrix, whereas vsearch_allpairs_files returns all
    # comparisons. Here we convert both to a square matrix.
    if data.shape[0] == nseqs * nseqs:
        distmat[ii, jj] = data['dist']
    elif data.shape[0] == (nseqs * (nseqs - 1)) / 2:
        distmat[ii, jj] = data['dist']
        distmat[jj, ii] = data['dist']
    else:
        msg = 'not all pairwise comparisons are represented ({})'
        raise UsearchError(msg.format(filename))

    return distmat
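
The ii/jj vectors produced by pd.match above are just integer row and column positions into the seqnames ordering, so the fill of the distance matrix is plain NumPy fancy indexing. A toy sketch of the symmetric (triangular input) case, with sequence names and distances invented for illustration and pd.Index.get_indexer standing in for pd.match:

import numpy as np
import pandas as pd

seqnames = ['s1', 's2', 's3']
pairs = pd.DataFrame({'query':  ['s1', 's1', 's2'],
                      'target': ['s2', 's3', 's3'],
                      'dist':   [0.10, 0.20, 0.30]})

nseqs = len(seqnames)
distmat = np.zeros((nseqs, nseqs))
ii = pd.Index(seqnames).get_indexer(pairs['query'])   # row positions
jj = pd.Index(seqnames).get_indexer(pairs['target'])  # column positions
distmat[ii, jj] = pairs['dist']
distmat[jj, ii] = pairs['dist']  # mirror the triangle to get a square matrix
print(distmat)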
Code Example #12
def getUserAppsDF():
    global jobDF, userDF, userAppsDF, userAppsDict, UserAppsSparseMatrix
    userAppsDF = pd.read_csv(projectHomeSJ +
                             "/Input/ApplicationData_sorted.csv",
                             names=['userId', 'jobId'])
    userAppsDF = userAppsDF[userAppsDF.jobId.isin(jobDF.job)]
    userDF = pd.DataFrame({'userId': userAppsDF.userId.unique()})
    userAppsDF['userLookUp'] = pd.match(userAppsDF['userId'], userDF['userId'])
    userAppsDF['jobLookUp'] = pd.match(userAppsDF['jobId'], jobDF['job'])
    userAppsDF_1 = pd.DataFrame(
        list(collectionCA.find({}, {
            '_id': 1,
            'userApps': 1
        })))
    userAppsDict = dict(zip(userAppsDF_1['_id'], userAppsDF_1['userApps']))
    del userAppsDF_1
    UserAppsSparseMatrix = sp.coo_matrix(
        (np.repeat(1, userAppsDF.shape[0]),
         (userAppsDF['userLookUp'], userAppsDF['jobLookUp'])),
        shape=(userDF.shape[0], jobDF.shape[0]))
    UserAppsSparseMatrix = UserAppsSparseMatrix.tocsr()
Code Example #13
File: memo.py Project: vg29029/discover-notebooks
def memo_test(events, selected_genes, groups, permutations=10000):
    groups_memo = [pandas.match(group, selected_genes) for group in groups]
    events_selected = events[selected_genes]
    sampler = switching.EventMatrixSampler(events_selected.astype(int),
                                           "gobbi")

    coverages = numpy.array(
        [events_selected[i].any(0).sum() for i in groups_memo])
    higher_coverage = numpy.zeros_like(coverages)

    for i in xrange(permutations):
        null_sample = sampler.sample()
        higher_coverage += (numpy.array(
            [null_sample[i].any(0).sum()
             for i in groups_memo]) >= coverages).astype(int)

    return (higher_coverage + 1.0) / (permutations + 1.0)
Code Example #14
File: gravity.py Project: shepherdmeng/pysal
    def calc_Bj(self, dt, d, of, p, dc=False):
        """
        calculate Bj balancing factor
        """
        Bj = self.calc_dcy(self.c, self.cf, p)

        if of:
            for fx in of:
                Bj *= of[fx]**p[fx]
        if not dc:
            dt['Bj'] = Bj
        else:
            dt['Bj'] = Bj*dt['Ai']*dt['Oi']
        Bj = (dt.groupby(d).aggregate({'Bj': np.sum}))
        Bj['Bj'] = 1/Bj['Bj']
        Bj = Bj.ix[pd.match(d, Bj.index), 'Bj']
        return Bj.reset_index(level=0, drop=True)
Code Example #15
File: gravity.py Project: shepherdmeng/pysal
    def calc_Ai(self, dt, o, df, p, dc=False):
        """
        calculate Ai balancing factor
        """
        Ai = self.calc_dcy(self.c, self.cf, p)

        if df:
            for fx in df:
                Ai *= df[fx]**p[fx]

        if not dc:
            dt['Ai'] = Ai
        else:
            dt['Ai'] = Ai*dt['Bj']*dt['Dj']
        Ai = (dt.groupby(o).aggregate({'Ai': np.sum}))
        Ai['Ai'] = 1/Ai['Ai']
        Ai = Ai.ix[pd.match(o, Ai.index), 'Ai']
        return Ai.reset_index(level=0, drop=True)
Code Example #16
File: comp.py Project: mikemunsell/BNPTSClust-python
def comp(vector):

    # Function that computes the distinct observations in a numeric vector.
    # It is based entirely on the "comp11" function from the BNPTSclust
    # package in R created by David Alejandro Martell Juarez
    #
    # IN:
    #
    # vector <- numeric vector.
    #
    # OUT:
    #
    # jstar <- variable that rearranges the input vector into a vector with only
    #          its unique values.
    # nstar <- frequency of each distinct observation in the input vector.
    # rstar <- number of distinct observations in the input vector.
    # gn    <- variable that indicates the group number to which every
    #          entry in the input vector belongs.

    n = len(vector)

    mat = vector[:, None] == vector

    jstar = np.repeat(False, n)

    led = np.repeat(False, n)

    for j in np.arange(0, n):
        if not led[j]:
            jstar[j] = True
            if j + 1 == n:
                break
            ji = np.arange(j + 1, n)
            tt = mat[ji, j] == True
            led[ji] = led[ji] | tt
        if all(np.delete(led, np.arange(0, j + 1))):
            break

    ystar = vector[jstar]
    nstar = np.apply_along_axis(np.sum, 0, mat[:, jstar])
    rstar = len(nstar)
    gn = pd.match(vector, ystar)

    return jstar, nstar, rstar, gn
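
A quick worked example of what comp() returns, assuming the np/pd imports used above and a pandas version old enough to still provide pd.match; the input vector is invented:

vector = np.array([3, 1, 3, 2, 1])
jstar, nstar, rstar, gn = comp(vector)
# jstar -> [True, True, False, True, False]  (marks the first occurrence of each value)
# nstar -> [2, 2, 1]                         (counts of 3, 1 and 2)
# rstar -> 3                                 (number of distinct values)
# gn    -> [0, 1, 0, 2, 1]                   (group index of every entry, via pd.match)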
Code Example #17
            if s >= .6 : cn+=1
        if cn > 0:
            res.append([se_id[i],unip,tname,cn])
    return res

compound_target_mapping = []
for i,r in target_pairs.iterrows():
    if i%200 == 0 and i > 0:
        print i

    # target names/symbols
    t1 = r['TARGET_A']
    t2 = r['TARGET_B']

    # get corresponding uniprots
    unip1 = targets.iloc[pd.match([t1],targets['SYMBOL'])]['UNIPROT'].tolist()[0]
    unip2 = targets.iloc[pd.match([t2],targets['SYMBOL'])]['UNIPROT'].tolist()[0]

    # get twosides compounds that have >.6 similarity to
    # at least one target active in ChEMBL
    se_cp1 = get_se_compound(unip1,t1)
    se_cp2 = get_se_compound(unip2,t2)
    
    if len(se_cp1) > 0 and len(se_cp2) > 0:
        # compound_target_mapping.append(se_cp1+se_cp2)

        for v1 in se_cp1 : # for 1st target/stitch compounds association
            for v2 in se_cp2: # for 2nd target/stitch compounds association
                # if both the compound for the 1st target and the compound for the 2nd target
                # are in the twosides dataset 
                if v1[0] in SE_data['stitch_id1'] and v2[0] in SE_data['stitch_id2']: 
Code Example #18
 def test_match(self):
     with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
         pd.match([1, 2, 3], [1])
Code Example #19
File: test_api.py Project: glyg/pandas
 def test_match(self):
     with tm.assert_produces_warning(FutureWarning,
                                     check_stacklevel=False):
         pd.match([1, 2, 3], [1])
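
The two test snippets above only assert that calling pd.match raises a FutureWarning: the function was deprecated and has since been removed from pandas. The equivalent positional lookup in current pandas is Index.get_indexer; a minimal sketch:

import pandas as pd

# pd.match([1, 2, 3], [1]) returned, for each value, its position in the second
# argument (-1 when absent). Index.get_indexer gives the same answer today:
positions = pd.Index([1]).get_indexer([1, 2, 3])
print(positions)  # [ 0 -1 -1]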
Code Example #20
def setup(data, trips, sep, cost, factors, constraints, prodCon, attCon,
          initialParams, Oi, Dj, totalFlows):
    """
    set up all initial variables and balancing factors for mle calibration
    """

    #The following setup is for within all models

    #There is always a beta parameter so set it to user's initial value and add to param list
    data['beta'] = initialParams['beta']
    params = ['beta']

    #This is the observed data for which we want to derive parameters
    if cost == 'exp':
        knowns = data[sep]
    elif cost == 'pow':
        knowns = np.log(data[sep])
    else:
        sys.exit(
            "The distance/cost function must be either 'pow' or 'exp'.")

    #For doubly constrained model
    if (prodCon == True) & (attCon == True):

        #Variables for constants and deriving them
        data["Bj"] = 1.0
        data["Ai"] = 1.0
        data["OldAi"] = 10.000000000
        data["OldBj"] = 10.000000000
        data["diff"] = abs((data["OldAi"] - data["Ai"]) / data["OldAi"])

        #Calc total outflows and inflows
        if Oi:
            data["Oi"] = data[Oi]
        else:
            Oi = data.groupby(data[constraints['production']]).aggregate(
                {trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']],
                                        Oi.index)].reset_index()[trips]

        if Dj:
            data["Dj"] = data[Dj]
        else:
            Dj = data.groupby(data[constraints['attraction']]).aggregate(
                {trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']],
                                        Dj.index)].reset_index()[trips]

    #For Production Constrained model
    if (prodCon == True) & (attCon == False):

        #Calc total outflows
        if factors == None:
            if not Dj:
                Dj = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Dj"] = Dj.ix[pd.match(
                    data[totalFlows],
                    Dj.index)].reset_index()[trips].sort_index()

            else:
                data["Dj"] = data[Dj]

        if not Oi:
            Oi = data.groupby(data[constraints['production']]).aggregate(
                {trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']],
                                        Oi.index)].reset_index()[trips]
        else:
            data['Oi'] = data[Oi]

    #For Attraction Constrained model
    if (prodCon == False) & (attCon == True):

        #Calc total inflows
        if factors == None:
            if not Oi:
                Oi = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Oi"] = Oi.ix[pd.match(data[totalFlows],
                                            Oi.index)].reset_index()[trips]
            else:
                data["Oi"] = data[Oi]
        if not Dj:
            Dj = data.groupby(data[constraints['attraction']]).aggregate(
                {trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']],
                                        Dj.index)].reset_index()[trips]
        else:
            data["Dj"] = data[Dj]

    #For Unconstrained Model
    if (prodCon == False) & (attCon == False):
        for factor in factors['origins']:
            #Include that information in the model
            knowns = knowns + np.log(data[factor])
            #Add to params list
            params.append(str(factor))
            #variable param vector
            data[str(factor) + 'Param'] = initialParams[factor]
        for factor in factors['destinations']:
            #Include that information in the model
            knowns = knowns + np.log(data[factor])
            #Add to params list
            params.append(str(factor))
            #variable param vector
            data[str(factor) + 'Param'] = initialParams[factor]

    #For all models besides unconstrained - is probably redundant and can be refactored

    #If there are additional factors we will include that observed data, add it to param list, and add a data vector for the param
    if factors != None:
        if attCon != False:
            for factor in factors['origins']:
                #Include that information in the model
                knowns = knowns + np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                data[str(factor) + 'Param'] = initialParams[factor]
        if prodCon != False:
            for factor in factors['destinations']:
                #Include that information in the model
                knowns = knowns + np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                data[str(factor) + 'Param'] = initialParams[factor]

    #Observed information is sum of trips multiplied by the log of known information
    observed = np.sum(data[trips] * knowns)

    #return observed info, data, known info, and params list
    return observed, data, knowns, params
Code Example #21
File: data4.py Project: nickmcadden/Kaggle
def load(m_params):
    num_features = m_params['n_features']
    minbin = m_params['minbin']
    getcached = m_params['getcached']
    codetest = m_params['codetest']

    trainfilename = 'train_' + str(num_features) + str(minbin) + '.h5'
    testfilename = 'test_' + str(num_features) + str(minbin) + '.h5'

    # Read HDF format file
    print("1a. Reading the train and test data...\n")
    if getcached and os.path.isfile('input/' + trainfilename):

        train = pd.read_hdf('input/' + trainfilename, 'train')
        test = pd.read_hdf('input/' + testfilename, 'test')
        labels = train['target']
        test_ids = test['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

        return train.values, labels.values, test.values, test_ids.values

    else:
        train = pd.read_hdf('input/train.h5', 'train')
        test = pd.read_hdf('input/test.h5', 'test')

        if codetest:
            train = train.ix[0:999, :]
            test = test.ix[0:999, :]

        labels = train['target']
        test_ids = test['ID']
        train_ids = train['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

        print(
            "1c. Breaking dataframe into numeric, object and date parts...\n")
        train_numeric = train.select_dtypes(include=['float64', 'int64'])
        test_numeric = test.select_dtypes(include=['float64', 'int64'])

        train_categoric = train.select_dtypes(include=['object'])
        test_categoric = test.select_dtypes(include=['object'])

        train_dates = train.select_dtypes(include=['datetime64[ns]'])
        test_dates = test.select_dtypes(include=['datetime64[ns]'])

        # Zip code engineering
        print("2. Zip code engineering...\n")
        train['VAR_0241'] = train['VAR_0241'].fillna(99999)
        test['VAR_0241'] = test['VAR_0241'].fillna(99999)
        train_zips = np.empty([train.shape[0], 7])
        test_zips = np.empty([test.shape[0], 7])
        try:
            zp = train['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            train_zips[:, 0] = zp.map(lambda x: x[:2]).astype('int32')
            train_zips[:, 1] = zp.map(lambda x: x[:1] + x[-1:]).astype('int32')
            train_zips[:, 2] = zp.map(lambda x: x[:3]).astype('int32')
            train_zips[:, 3] = zp.map(lambda x: x[1:3]).astype('int32')
            train_zips[:, 4] = zp.map(lambda x: x[1:4]).astype('int32')
            train_zips[:, 5] = zp.map(lambda x: x[2:4]).astype('int32')
            train_zips[:, 6] = zp.map(lambda x: x[3:5]).astype('int32')
            zp = test['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            test_zips[:, 0] = zp.map(lambda x: x[:2]).astype('int32')
            test_zips[:, 1] = zp.map(lambda x: x[:1] + x[-1:]).astype('int32')
            test_zips[:, 2] = zp.map(lambda x: x[:3]).astype('int32')
            test_zips[:, 3] = zp.map(lambda x: x[1:3]).astype('int32')
            test_zips[:, 4] = zp.map(lambda x: x[1:4]).astype('int32')
            test_zips[:, 5] = zp.map(lambda x: x[2:4]).astype('int32')
            test_zips[:, 6] = zp.map(lambda x: x[3:5]).astype('int32')

            zipcolumns = [
                'zip0', 'zip1', 'zip2', 'zip3', 'zip4', 'zip5', 'zip6'
            ]
            train_zips = pd.DataFrame(train_zips, columns=zipcolumns)
            test_zips = pd.DataFrame(test_zips, columns=zipcolumns)
        except:
            print("Zip codes can't be encoded")
            exit()

        # Deal with categorical data
        print("3. Categorical variable encoding... \n")
        for c in train_categoric.columns:
            freqs = train_categoric[c].append(test_categoric[c]).value_counts()
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          freqs[0:70].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         freqs[0:70].index)

        # Deal with categorical data
        print("4. Numeric Column Smoothing... \n")
        train_numeric = train_numeric.fillna(0)
        test_numeric = test_numeric.fillna(0)
        numeric_col_count = 0
        if minbin > 1:
            for c in train_numeric.columns:
                train_numeric[c], test_numeric[c] = bin(
                    train_numeric[c], test_numeric[c], labels, minbin)
                numeric_col_count += 1
                if not (numeric_col_count % 10):
                    print('Numeric Col Count: ', numeric_col_count)

        gc.collect()

        # Create new date transformations
        print('5. Create new date columns...\n')

        def tdtoint(td):
            if not pd.isnull(td):
                return td.astype('timedelta64[D]').astype(np.int32)
            else:
                return 0

        # Diffs between important dates
        for i in [
                'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                'VAR_0169', 'VAR_0178', 'VAR_0166'
        ]:
            for j in [
                    'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                    'VAR_0169', 'VAR_0178', 'VAR_0166'
            ]:
                if i < j:
                    keypair = i + '_' + j
                else:
                    keypair = j + '_' + i
                if i != j and keypair not in train_dates.columns:
                    train_dates[keypair] = train_dates[i] - train[j]
                    train_dates[keypair] = train_dates[keypair].apply(tdtoint)
                    test_dates[keypair] = test_dates[i] - test_dates[j]
                    test_dates[keypair] = test_dates[keypair].apply(tdtoint)

        # Date Splits
        datecols = pd.read_pickle('input/datecols.pkl')
        for c in datecols['col'].values.tolist():
            train_dates[c + '_y'] = train_dates[c].dt.year
            train_dates[c + '_m'] = train_dates[c].dt.month
            train_dates[c + '_d'] = train_dates[c].dt.day
            train_dates[c + '_wd'] = train_dates[c].dt.weekday
            train_dates[c + '_hr'] = train_dates[c].dt.hour
            test_dates[c + '_y'] = test_dates[c].dt.year
            test_dates[c + '_m'] = test_dates[c].dt.month
            test_dates[c + '_d'] = test_dates[c].dt.day
            test_dates[c + '_wd'] = test_dates[c].dt.weekday
            test_dates[c + '_hr'] = test_dates[c].dt.hour

        train_dates.drop(datecols['col'].values.tolist(), axis=1, inplace=True)
        test_dates.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

        gc.collect()

    print("5. Merging arrays together...\n")
    # put separate parts together again

    train = pd.concat(
        [train_categoric, train_dates, train_numeric, train_zips], axis=1)
    test = pd.concat([test_categoric, test_dates, test_numeric, test_zips],
                     axis=1)

    # Get only top n features
    print("1b. Filtering by pickled important columns...\n")
    cols = pd.read_pickle("input/vars_importance.pkl")
    cols = list(cols.ix[0:num_features, "var"])

    for c in cols:
        if c not in train.columns:
            cols.remove(c)

    train = train[cols].fillna(0)
    test = test[cols].fillna(0)

    gc.collect()

    try:
        print("6. Writing to hdf format...\n")
        pd.concat([train_ids, train, labels],
                  axis=1).to_hdf('input/' + trainfilename,
                                 key='train',
                                 format='fixed',
                                 mode='w')
        pd.concat([test_ids, test], axis=1).to_hdf('input/' + testfilename,
                                                   key='test',
                                                   format='fixed',
                                                   mode='w')
    except:
        error = sys.exc_info()[0]
        print("Error: %s" % error)

    return train.values, labels.values, test.values, test_ids.values
Code Example #22
sider_se = sider_se[sider_se[3] == 'PT']

#sider_fq = pd.read_table('/home/az338/ucc-fileserver/sider/meddra_freq.tsv',header=None)
#sider_fq = sider_fq.drop(0,1)


# concatenate sider and offsides side effect
sider_se = sider_se.drop([3,4],1)
sider_se.columns = ['stitch_id','umls_id','event']
allSE = pd.concat([sider_se,offsides[['stitch_id','umls_id','event']]])
allSE.to_csv(DATA_DIR+'sider_and_offsides_SE.csv')

# load stitch_id  smiles mapping dataset 
stitch_smiles = pd.read_table('/home/az338/ucc-fileserver/stitch_v4/chemicals.v4.0.tsv')


# match stitch id in SE dataset to smiles
idx_offsides = pd.match(offsides['stitch_id'],stitch_smiles['chemical'])
idx_offsides = idx_offsides[idx_offsides >= 0]

idx_sider = pd.match(sider_se['stitch_id'],stitch_smiles['chemical'])
idx_sider = idx_sider[idx_sider >= 0]

# concatenate sider and offsides structures
offsides_struct = stitch_smiles.iloc[idx_offsides][['chemical','SMILES_string']].drop_duplicates()
sider_struct = stitch_smiles.iloc[idx_sider][['chemical','SMILES_string']].drop_duplicates()
all_struct = pd.concat([offsides_struct,sider_struct]).drop_duplicates()
all_struct.to_csv(DATA_DIR+'sider_offsides_smiles.csv')


Code Example #23
import pandas as pd


DATA_DIR = '/home/az338/ucc-fileserver/AZ_challenge_data/'
SE_DATA_DIR = '/scratch/az338/ucc-fileserver/side_effects_data/'

# load twosides
SE_data = pd.read_table(SE_DATA_DIR+'3003377s-twosides.tsv')

# load compound structures 
SE_struct = pd.read_table('/home/az338/ucc-fileserver/stitch_v4/chemicals.v4.0.tsv')

# get unique compound identifiers in twosides
twosides_compounds = list(set(SE_data['stitch_id1'].values.tolist()+SE_data['stitch_id2'].values.tolist()))

# match these compounds to the OFFSIDES/SIDER structural dataset 
# to match the stitch IDs to their structure
idx = pd.match(twosides_compounds,SE_struct['chemical'])
twosides_struct = SE_struct.iloc[idx[idx >= 0]]
twosides_struct=twosides_struct.drop('molecular_weight',axis=1)

twosides_struct.to_csv(SE_DATA_DIR+'twosides_smiles.csv')
Code Example #24
File: algorithms.py Project: BobMcFry/pandas
 def time_match_string(self):
     with warnings.catch_warnings(record=True):
         pd.match(self.all, self.uniques)
Code Example #25
def load(m_params):

    num_features = m_params['n_features']
    minbin = m_params['minbin']
    getcached = m_params['getcached']

    t0 = time.time()

    trainfilename = 'train_' + str(num_features) + str(minbin) + '.h5'
    testfilename = 'test_' + str(num_features) + str(minbin) + '.h5'

    # Read HDF format file
    print("1. Reading the train and test data...\n")

    if getcached and os.path.isfile(trainfilename):

        train = pd.read_hdf(trainfilename, 'train')
        test = pd.read_hdf(testfilename, 'test')
        labels = train['target']
        test_ids = test['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

        return train.values, labels.values, test.values, test_ids.values

    elif getcached and os.path.isfile('train_binned_' + str(minbin) + '.h5'):

        train = pd.read_hdf('train_binned_' + str(minbin) + '.h5', 'train')
        test = pd.read_hdf('test_binned_' + str(minbin) + '.h5', 'test')
        labels = train['target']
        test_ids = test['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

    else:

        train = pd.read_hdf('train.h5', 'train')
        test = pd.read_hdf('test.h5', 'test')

        labels = train['target']
        test_ids = test['ID']

        gc.collect()

        print("Postcode column \n")
        print(train['VAR_0241'].dtype, len(np.unique(train['VAR_0241'])))
        print(test['VAR_0241'].dtype, len(np.unique(test['VAR_0241'])))

        # Zip code engineering
        print("4. Zip code engineering...\n")
        train['VAR_0241'] = train['VAR_0241'].fillna(99999)
        test['VAR_0241'] = test['VAR_0241'].fillna(99999)
        try:
            zp = train['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            train['zip_00xxx'] = zp.map(lambda x: x[:2]).astype('int32')
            train['zip_0xxx0'] = zp.map(lambda x: x[:1] + x[-1:]).astype(
                'int32')
            train['zip_000xx'] = zp.map(lambda x: x[:3]).astype('int32')
            train['zip_x00xx'] = zp.map(lambda x: x[1:3]).astype('int32')
            train['zip_x000x'] = zp.map(lambda x: x[1:4]).astype('int32')
            train['zip_xx00x'] = zp.map(lambda x: x[2:4]).astype('int32')
            train['zip_xxx00'] = zp.map(lambda x: x[3:5]).astype('int32')
            zp = test['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            test['zip_00xxx'] = zp.map(lambda x: x[:2]).astype('int32')
            test['zip_0xxx0'] = zp.map(lambda x: x[:1] + x[-1:]).astype(
                'int32')
            test['zip_000xx'] = zp.map(lambda x: x[:3]).astype('int32')
            test['zip_x00xx'] = zp.map(lambda x: x[1:3]).astype('int32')
            test['zip_x000x'] = zp.map(lambda x: x[1:4]).astype('int32')
            test['zip_xx00x'] = zp.map(lambda x: x[2:4]).astype('int32')
            test['zip_xxx00'] = zp.map(lambda x: x[3:5]).astype('int32')
        except:
            print('BOLLOCKS Zip codes cant be encoded')
            exit()

        # Deal with categorical data and smoothing
        print(
            "2. Categorical variable encoding and numeric col smoothing... \n")
        numeric_col_count = 0
        for c in train.columns[1:-1]:
            if train[c].name != 'target':
                if train[c].dtype.name == 'object':
                    freqs = train[c].append(test[c]).value_counts()
                    train[c] = pd.match(train[c].values, freqs[0:70].index)
                    test[c] = pd.match(test[c].values, freqs[0:70].index)
                elif train[c].dtype.name in ['int64', 'float64'
                                             ] and minbin > 1:
                    # smooth numeric cols
                    train[c] = bin(train[c], train[c], train['target'], minbin)
                    test[c] = bin(test[c], train[c], train['target'], minbin)
                    numeric_col_count += 1
                    if not (numeric_col_count % 10):
                        print('Numeric Col Count: ', numeric_col_count)

        gc.collect()

        # Create new date transformations
        print('3. Create new date columns...\n')

        def tdtoint(td):
            if not pd.isnull(td):
                return td.astype('timedelta64[D]').astype(int)
            else:
                return 0

        # Diffs between important dates
        for i in [
                'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                'VAR_0169', 'VAR_0178', 'VAR_0166'
        ]:
            for j in [
                    'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                    'VAR_0169', 'VAR_0178', 'VAR_0166'
            ]:
                if i < j:
                    keypair = i + '_' + j
                else:
                    keypair = j + '_' + i
                if i != j and keypair not in train.columns:
                    train[keypair] = train[i] - train[j]
                    train[keypair] = train[keypair].apply(tdtoint)
                    test[keypair] = test[i] - test[j]
                    test[keypair] = test[keypair].apply(tdtoint)

        # Date Splits
        datecols = pd.read_pickle('datecols.pkl')
        for c in datecols['col'].values.tolist():
            train[c + '_y'] = train[c].dt.year
            train[c + '_m'] = train[c].dt.month
            train[c + '_d'] = train[c].dt.day
            train[c + '_wd'] = train[c].dt.weekday
            train[c + '_hr'] = train[c].dt.hour
            test[c + '_y'] = test[c].dt.year
            test[c + '_m'] = test[c].dt.month
            test[c + '_d'] = test[c].dt.day
            test[c + '_wd'] = test[c].dt.weekday
            test[c + '_hr'] = test[c].dt.hour

        train.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

        gc.collect()

        # Fill any remaining N/As
        train = train.fillna(0)
        test = test.fillna(0)

        #print("4.5. Writing to hdf format...\n")
        #train.to_hdf('train_binned_' + str(minbin) + '.h5',key='train',format='fixed',mode='w')
        #test.to_hdf('test_binned_' + str(minbin) + '.h5',key='test',format='fixed',mode='w')

    # Get only top n features
    print("5. Filtering by pickled important columns...\n")
    cols = pd.read_pickle("vars_importance.pkl")
    cols = cols.ix[0:num_features, "var"].tolist()

    print("6. Writing to hdf format...\n")
    #zipcols = ['zip_00xxx', 'zip_0xxx0', 'zip_000xx', 'zip_x00xx', 'zip_x000x', 'zip_xx00x', 'zip_xxx00']
    zipcols = []
    train[cols + zipcols + ['ID', 'target']].to_hdf(trainfilename,
                                                    key='train',
                                                    format='fixed',
                                                    mode='w')
    test[cols + zipcols + ['ID']].to_hdf(testfilename,
                                         key='test',
                                         format='fixed',
                                         mode='w')

    train = train[cols + zipcols]
    test = test[cols + zipcols]

    gc.collect()

    return train.values, labels.values, test.values, test_ids.values
Code Example #26
File: nauka_pythona.py Project: wosarn/python

path_var = 'E:\\Wojtek\\_DSCN_\\Analiza_danych\\Leas\\data_set\\'
var_df = pd.read_csv(path_var+'variables.csv', sep=';')
var_df.columns
var_df.type_var.value_counts()
var_df.head(10)
var_df.type_pred.value_counts(dropna=False)


#predictors = var_df.variable[ (var_df.type_var == 'pred') | (var_df.variable == target_name) ]
predictors = var_df.variable[ var_df.type_var == 'pred' ]
ind_target = data_frame.columns.get_loc(target_name)
type(predictors)
#type(list)(predictors)
pd.match(df.columns, predictors) # returns the match positions: the R equivalent of which( vect1 %in% vec2 )


col = df.columns.isin(predictors)+df.columns.isin([target_name])  # this is not a particularly elegant solution

target_name = 'TR_D90M12'
df_filtered = df.loc[ (df.FFINRPFH_czy == 1) 
            & (df.TR_ANEKS_RODZAJ_id  == 0) 
            & (df.TR_POZIOM_wykonanie == 1)  
            & (df.PORECZYCIEL_CZY == 0)
            & (pd.isnull(df.TR_FRAUD_DataStatusu)) 
            & (~ pd.isnull(df.loc[:,target_name])), col]
df_filtered.shape

df_filtered2 = df_filtered.dropna(axis = 0, how = 'any')
df_filtered2.shape
Code Example #27
def get_dense_specs():
    train = pd.read_csv('../input/train_set.csv', parse_dates=[
        2,
    ])
    test = pd.read_csv('../input/test_set.csv', parse_dates=[
        3,
    ])
    tube = pd.read_csv('../input/tube.csv',
                       true_values=['Y'],
                       false_values=['N'])
    materials = pd.read_csv('../input/bill_of_materials.csv')
    aggs = pd.read_csv('../input/ta_aggs.csv')
    components = pd.read_csv('../input/components.csv')

    train = pd.merge(train, tube, on='tube_assembly_id')
    test = pd.merge(test, tube, on='tube_assembly_id')
    train = pd.merge(train, materials, on='tube_assembly_id')
    test = pd.merge(test, materials, on='tube_assembly_id')

    train = pd.merge(train, aggs, on='tube_assembly_id', how='left')
    test = pd.merge(test, aggs, on='tube_assembly_id', how='left')

    # create some new features
    train['year'] = train.quote_date.dt.year
    train['month'] = train.quote_date.dt.month

    test['year'] = test.quote_date.dt.year
    test['month'] = test.quote_date.dt.month

    train['odd'] = train.quantity % 2
    test['odd'] = test.quantity % 2

    train['div5'] = (train.quantity % 5)
    test['div5'] = (test.quantity % 5)

    train['material_id'].replace(np.nan, ' ', regex=True, inplace=True)
    test['material_id'].replace(np.nan, ' ', regex=True, inplace=True)

    train['bracket_pricing'] = train['bracket_pricing'].replace(['Yes', 'No'],
                                                                [1, 0])
    test['bracket_pricing'] = test['bracket_pricing'].replace(['Yes', 'No'],
                                                              [1, 0])

    fields_to_encode = [
        'supplier', 'material_id', 'end_a', 'end_x', 'end_a_1x', 'end_a_2x',
        'end_x_1x', 'end_x_2x', 'bracket_pricing'
    ]

    for i in range(1, 9):
        column_label = 'component_id_' + str(i)
        fields_to_encode.append(column_label)
        tmp = pd.merge(train,
                       components,
                       left_on=column_label,
                       right_on='component_id',
                       how='left')['component_type_id']
        train[column_label] = tmp
        tmp = pd.merge(test,
                       components,
                       left_on=column_label,
                       right_on='component_id',
                       how='left')['component_type_id']
        test[column_label] = tmp
        train[column_label].replace(np.nan, ' ', regex=True, inplace=True)
        test[column_label].replace(np.nan, ' ', regex=True, inplace=True)

    for j, clf in enumerate(train.columns.tolist()):
        print(j, clf)
    '''    
    # label encode the categorical variables
    for i in fields_to_encode:
        print('Encoding',i)
        lbl = LabelEncoder()
        lbl.fit(list(train.ix[:,i]) + list(test.ix[:,i]))
        train.ix[:,i] = lbl.transform(train.ix[:,i])
        test.ix[:,i] = lbl.transform(test.ix[:,i])

    for i in fields_to_encode:
        print('Encoding',i)
        freqs = train[i].append(test[i]).value_counts()
        train[i] = pd.match(train[i].values, freqs[0:45].index)
        test[i] = pd.match(test[i].values, freqs[0:45].index)
    '''
    for i in fields_to_encode:
        print('Encoding', i)
        rank = pd.concat([train[i], train['cost']],
                         axis=1).groupby(i).mean().sort('cost',
                                                        ascending=False)
        print(rank[0:20])
        train[i] = pd.match(train[i].values, rank[0:45].index)
        test[i] = pd.match(test[i].values, rank[0:45].index)

    train.fillna(0, inplace=True)
    test.fillna(0, inplace=True)

    return train, test
Code Example #28
core_variables = [
    'DO_mgL', 'satDO_mgL', 'DOsat_pct', 'WaterTemp_C', 'Depth_m', 'Level_m',
    'Discharge_m3s', 'Light_PAR', 'Light_lux'
]

varcells = []
for x in sitedata.Variables:
    if x is None:
        varcells.append(x)
    else:
        var_arr = np.asarray(x.split(','))
        isCore = np.in1d(var_arr, core_variables)
        core = var_arr[isCore]
        not_core = var_arr[~isCore]
        if any(core):
            core = core[np.argsort(pd.match(core, core_variables))]
        not_core.sort()
        var_arr = ', '.join(np.concatenate((core, not_core)))
        varcells.append(var_arr)

for i in xrange(len(varcells)):
    if varcells[i] is None:
        varcells[i] = '-'

sitedata.Variables = varcells
fr = sitedata['firstRecord'].dt.strftime('%Y-%m-%d')
lr = sitedata['lastRecord'].dt.strftime('%Y-%m-%d')
timerange = fr + ' to ' + lr
sitedata['Coverage'] = timerange.apply(lambda x: x
                                       if x != 'NaT to NaT' else '-')
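
The argsort(pd.match(...)) idiom above just reorders the core variables detected at a site into the canonical order of core_variables. A toy trace with get_indexer standing in for pd.match (the detected list is invented):

import numpy as np
import pandas as pd

core_variables = ['DO_mgL', 'satDO_mgL', 'DOsat_pct', 'WaterTemp_C']
core = np.asarray(['WaterTemp_C', 'DO_mgL'])  # variables found at one site

order = np.argsort(pd.Index(core_variables).get_indexer(core))
print(core[order])  # ['DO_mgL' 'WaterTemp_C']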
Code Example #29
File: predict.py Project: tianbu/ExPecto
        index_end = int(np.minimum(
            (args.splitIndex) * np.ceil(float(h5f.shape[0] / 2) / args.splitFold), (h5f.shape[0] / 2)))
    else:
        index_start = 0
        index_end = int(h5f.shape[0] / 2)

    snp_temp = (np.asarray(h5f[index_start:index_end,:])+ np.asarray(h5f[index_start+int(h5f.shape[0]/2):index_end+int(h5f.shape[0]/2),:]))/2.0
    snpEffects.append(snp_temp)


coor = pd.read_csv(args.coorFile,sep='\t',header=None)
coor = coor.iloc[index_start:index_end,:]

#Fetch the distance to TSS information
gene = pd.read_csv(args.geneFile,sep='\t',header=None)
geneinds = pd.match(coor.iloc[:,0].map(str).str.replace('chr','')+' '+coor.iloc[:,1].map(str),
            gene.iloc[:,0].map(str).str.replace('chr','')+' '+gene.iloc[:,2].map(str))
if np.any(geneinds==-1):
    raise ValueError("Gene association file does not match the vcf file.")
if args.fixeddist == 0:
    dist = - np.asarray(gene.iloc[geneinds,-1])
else:
    dist = args.fixeddist
genename = np.asarray(gene.iloc[geneinds,-2])
strand= np.asarray(gene.iloc[geneinds,-3])

#compute expression effects
snpExpEffects = compute_effects(snpEffects, \
                                dist, strand,\
                                models, maxshift=maxshift, nfeatures=args.nfeatures,
                                batchSize = args.batchSize)
#write output
Code Example #30
File: xgb_big.py Project: nickmcadden/Kaggle
    train[c + '_m'] = train[c].dt.month
    train[c + '_d'] = train[c].dt.day
    train[c + '_wd'] = train[c].dt.weekday
    train[c + '_hr'] = train[c].dt.hour
    test[c + '_y'] = test[c].dt.year
    test[c + '_m'] = test[c].dt.month
    test[c + '_d'] = test[c].dt.day
    test[c + '_wd'] = test[c].dt.weekday
    test[c + '_hr'] = test[c].dt.hour
train.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

print("categorical variable encoding and cleaning...\n")
for c in train.columns[1:-1]:
    if train[c].dtype.name == 'object':
        freqs = train[c].append(test[c]).value_counts()
        train[c] = pd.match(train[c].values, freqs[0:70].index)
        test[c] = pd.match(test[c].values, freqs[0:70].index)

train = train.fillna(0)
test = test.fillna(0)

labels = train['target']
train.drop(['ID', 'target'], axis=1, inplace=True)
features = train.columns.values

print("filtering by pickled important columns...\n")
vars = pd.read_pickle("vars_importance.pkl")
train = train[vars.ix[0:1250, "var"].tolist()]
test = test[vars.ix[0:1250, "var"].tolist()]

print("converting to numpy array...\n")
Code Example #31
File: entropy.py Project: tayoshan/IpythonNotebooks
def setup(data, trips, sep, cost, factors, constraints, prodCon, attCon, initialParams, Oi, Dj, totalFlows):

    #For doubly constrained model
    if (prodCon == True) & (attCon == True):

        #Variables for constants and deriving them
        data["Bj"] = 1.0
        data["Ai"] = 1.0
        data["OldAi"] = 10.000000000
        data["OldBj"] = 10.000000000
        data["diff"] = abs((data["OldAi"] - data["Ai"])/data["OldAi"])

        #Calc total outflows and inflows
        if Oi:
            print '1'
            data["Oi"] = data[Oi]
        else:
            print '2'
            Oi = data.groupby(data[constraints['production']]).aggregate({trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips]

        if Dj:
            print '3'
            data["Dj"] = data[Dj]
        else:
            print '4'
            Dj = data.groupby(data[constraints['attraction']]).aggregate({trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips]


    #For Production Constrained model
    if (prodCon == True) & (attCon == False):

        #Calc total outflows
        if factors == None:
            print Dj
            if not Dj:
                Dj = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Dj"] = Dj.ix[pd.match(data[totalFlows], Dj.index)].reset_index()[trips].sort_index()

            else:
                data["Dj"] = data[Dj]

        if not Oi:
            Oi = data.groupby(data[constraints['production']]).aggregate({trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips]
        else:
            data['Oi'] = data[Oi]


    #For Attraction Constrained model
    if (prodCon == False) & (attCon == True):

        #Calc total inflows
        if factors == None:
            if not Oi:
                Oi = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Oi"] = Oi.ix[pd.match(data[totalFlows], Oi.index)].reset_index()[trips]
            else:
                data["Oi"] = data[Oi]
        if not Dj:
            Dj = data.groupby(data[constraints['attraction']]).aggregate({trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips]
        else:
            data["Dj"] = data[Dj]


    #For Unconstrained Model
    if (prodCon == False) & (attCon == False):
        pass

    #The following setup is for within all models

    #There is always a beta parameter so set it to user's initial value and add to param list
    print initialParams
    data['beta'] = initialParams['beta']
    params = ['beta']

    #This is the observed data for which we want to derive parameters
    if cost == 'exp':
        knowns = data[sep]
    elif cost == 'pow':
        knowns = np.log(data[sep])
    else:
        sys.exit("The distance/cost function must be either 'pow' or 'exp'.")

    #If there are additional factors we will include that observed data, add it to param list, and add a data vector for the param
    if factors != None:
        if attCon != False:
            for factor in factors['origins']:
                #Include that information in the model
                knowns = knowns+np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                data[str(factor) + 'Param'] = initialParams[factor]
        if prodCon != False:
            for factor in factors['destinations']:
                #Include that information in the model
                knowns = knowns+np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                print initialParams
                data[str(factor) + 'Param'] = initialParams[factor]

    #Observed information is sum of trips multiplied by the log of known information
    observed = np.sum(data[trips]*knowns)


    #return observed info, data, known info, and params list
    return observed, data, knowns, params
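The Oi/Dj bookkeeping above is where this function leans hardest on pd.match; a minimal sketch of the same row-aligned totals with groupby(...).transform('sum'), using invented column names and toy values purely for illustration:

import pandas as pd

# Toy flow table; 'origin', 'destination' and 'trips' are illustrative names only.
data = pd.DataFrame({
    'origin':      ['a', 'a', 'b', 'b'],
    'destination': ['x', 'y', 'x', 'y'],
    'trips':       [10, 20, 5, 15],
})

# transform('sum') returns a Series aligned to data's index, so the
# groupby + pd.match + .ix re-matching step is not needed.
data['Oi'] = data.groupby('origin')['trips'].transform('sum')       # 30, 30, 20, 20
data['Dj'] = data.groupby('destination')['trips'].transform('sum')  # 15, 35, 15, 35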
Code example #32
0
import pandas as pd

DATA_DIR = '/scratch/az338/ucc-fileserver/AZ_challenge_data/'

# challenge training data, cell/disease area(DA) mapping and challenge cmp/target mapping
challenge_pairs = pd.read_csv(DATA_DIR+'drug_synergy_data/ch1_train_combination_and_monoTherapy.csv')
cell_da_map = pd.read_csv(DATA_DIR+'sanger_molecular_data/cell_info.csv')
challenge_cmp_target_map = pd.read_csv(DATA_DIR+'drug_synergy_data/Drug_info_release_curated.csv')
challenge_cmp_target_map.columns = ['ChallengeName','Target'] + list(challenge_cmp_target_map.columns[2:])

# map disease area to corresponding cell-line
challenge_pairs['DISEASE_AREA'] = cell_da_map.iloc[pd.match(challenge_pairs['CELL_LINE'], cell_da_map['Sanger.Name'])]['Disease.Area'].tolist()

# make mapping flat (separate targets to different lines)
cmp_target_map = challenge_cmp_target_map[['ChallengeName','Target']]

flat_map = []
for i, r in cmp_target_map.iterrows():
    for t in r['Target'].split(','):
        flat_map.append([r['ChallengeName'],t.rstrip(' ').lstrip(' ')])

flat_map = pd.DataFrame(flat_map).drop_duplicates()
flat_map.columns = ['Compound','Target']

# convert compound-compound associations to target-target associations
synergy_scores = challenge_pairs[['DISEASE_AREA','COMPOUND_A','COMPOUND_B','SYNERGY_SCORE','QA','CELL_LINE']]
target_synergy = []
for i, r in synergy_scores.iterrows():
    #targets_A = flat_map.iloc[pd.match(r['COMPOUND_A'],flat_map['Compound'])]['Target']
    #targets_B = flat_map.iloc[pd.match(r['COMPOUND_B'],flat_map['Compound'])]['Target']
    targets_A = flat_map[flat_map['Compound'] == r['COMPOUND_A']]['Target']
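The DISEASE_AREA lookup above (and the commented-out target lookups) can also be written as a plain left merge; a sketch, assuming Sanger.Name uniquely identifies each cell line in cell_info.csv:

# Left-merge equivalent of the pd.match/iloc lookup used for DISEASE_AREA above;
# how='left' keeps the row order of challenge_pairs.
merged = challenge_pairs.merge(cell_da_map[['Sanger.Name', 'Disease.Area']],
                               how='left', left_on='CELL_LINE', right_on='Sanger.Name')
challenge_pairs['DISEASE_AREA'] = merged['Disease.Area'].values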
Code example #33
0



import os
import re
import pandas as pd
from numpy import arange, array, unique
# asjp2tokens is defined elsewhere in this project; it is not part of the excerpt.

path = os.getenv('HOME')+'/python/phylogeny/pavelMattis/vector_machines/'


for f in [x for x in os.listdir(path+'data/list_length_project/sets/CognateData/output') if x!='.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path+'data/list_length_project/sets/CognateData/output/'+f,encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
    output['ID'] = arange(len(data))+1
    output['Taxon'] = data.language.astype('string')
    output['Gloss'] = data.gloss.values
    output['GlossID'] = pd.match(data.gloss.values,data.gloss.unique())+1
    output['IPA'] = [re.sub(r"[ -]","",unicode(x)) for x in data.transcription]
    output['Tokens'] = [' '.join(asjp2tokens(unicode(w))) for w in output.IPA]
    cClasses = array([x+':'+unicode(y).strip('?')
                      for (x,y) in data[['gloss','cognate_class']].values])
    output['CogID'] = pd.match(cClasses,unique(cClasses))
    output[['Taxon','Gloss']] = output[['Taxon','Gloss']].astype('string')
    output['dbID'] = [db+'_'+str(x-1) for x in output.ID.values]
    output.to_csv('reformattedData/asjp/'+db+'.tsv',encoding='utf-8',
                  sep='\t',index=False)

for f in [x for x in os.listdir(path+'data/list_length_project/sets/mattis_new/output') if x!='.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path+'/data/list_length_project/sets/mattis_new/output/'+f,encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
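For the GlossID and CogID columns above, pd.match against the unique values is just integer coding, which pd.factorize provides directly; a short sketch (default factorize order follows first appearance like pd.unique, while sort=True follows numpy's sorted unique):

# Drop-in replacements for the pd.match(...) integer-coding lines above.
output['GlossID'] = pd.factorize(data.gloss.values)[0] + 1
output['CogID'] = pd.factorize(cClasses, sort=True)[0]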
Code example #34
0
File: feature7.py Project: PhyloStar/svmcc
def _create_pandas_frame(dataset_path, samples_path, targets_path):
    """
	Creates and returns a pandas DataFrame object that includes the dataset's
	samples and targets. Also, the samples are augmented by calculating and
	adding the feature7 column.
	
	Note that the function requires paths as arguments instead of the data
	itself (which is why the temp dir is created in the calling add_feature7).
	This is for reasons that were once reasonable.
	"""
    fname = dataset_path.split('/')[-1]
    db = fname.split('.')[0]
    # read in wordlist
    wordlist = pd.read_table(dataset_path,
                             encoding='utf-8',
                             na_filter=False,
                             dtype=object)
    # keep track of synonyms within the same language
    synDict = defaultdict(lambda: 0)
    synocc = []
    for l, g in wordlist[['language', 'global_id']].values:
        synDict[l, g] += 1
        synocc.append(unicode(synDict[l, g]))
    wordlist['synonym_number'] = synocc
    dDict = {
        'sample_id': unicode,
        'feature1': double,
        'feature2': double,
        'feature3': double,
        'feature4': double,
        'feature5': double,
        'feature6': double,
        'feature8': double
    }
    # read in feature matrix for word pairs
    vectors = pd.read_table(samples_path,
                            encoding='utf-8',
                            na_filter=False,
                            dtype=dDict)
    # read in cognacy judgments
    labels = pd.read_table(targets_path,
                           encoding='utf-8',
                           na_filter=False,
                           dtype={
                               'sample_id': unicode,
                               'target': int
                           })
    # collect metadata for word pairs in vectors
    metaRaw = array([x.split('/') for x in vectors.sample_id.values])
    meta = pd.DataFrame(c_[metaRaw[:,
                                   0], [x.split(',') for x in metaRaw[:, 1]],
                           [x.split(',') for x in metaRaw[:, 2]]],
                        columns=['global_id', 'l1', 'l2', 'id1', 'id2'])
    meta['sample_id'] = vectors.sample_id
    meta1 = pd.merge(wordlist[[
        'global_id', 'language', 'gloss', 'synonym_number', 'transcription',
        'cognate_class'
    ]],
                     meta,
                     left_on=['global_id', 'language', 'synonym_number'],
                     right_on=['global_id', 'l1', 'id1'])[[
                         'sample_id', 'global_id', 'l1', 'l2', 'transcription',
                         'cognate_class', 'id2'
                     ]]
    meta2 = pd.merge(wordlist[[
        'global_id', 'language', 'gloss', 'synonym_number', 'transcription',
        'cognate_class'
    ]],
                     meta1,
                     left_on=['global_id', 'language', 'synonym_number'],
                     right_on=['global_id', 'l2', 'id2'])[[
                         'sample_id', 'gloss', 'l1', 'transcription_y',
                         'cognate_class_y', 'l2', 'transcription_x',
                         'cognate_class_x'
                     ]]
    meta2.columns = [
        'sample_id', u'gloss', 'l1', u'w1', u'cc1', 'l2', u'w2', u'cc2'
    ]
    meta2 = meta2.ix[pd.match(vectors.sample_id, meta2.sample_id)]
    concepts = meta2.gloss.unique()
    feature7 = pd.Series([
        abs(
            corrcoef(
                array(
                    vectors[meta2.gloss == c][['feature2', 'feature4']].values,
                    double).T)[0, 1]) for c in concepts
    ],
                         index=concepts,
                         dtype=double)
    feature7[feature7.isnull()] = 0
    vectors['feature7'] = feature7.ix[meta2.gloss.values].values
    combined = pd.merge(pd.merge(meta2, vectors, on='sample_id'),
                        labels,
                        on='sample_id')
    combined = combined[combined.columns[1:]]
    combined['db'] = db

    return combined
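The meta2 = meta2.ix[pd.match(...)] row alignment in this function can be expressed without pd.match or .ix; a sketch, assuming sample_id is unique in meta2 (which the merges above imply):

# Reorder meta2 so its rows follow vectors.sample_id, replacing the
# pd.match + .ix positional lookup.
meta2 = (meta2.set_index('sample_id')
              .reindex(vectors.sample_id)
              .reset_index())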
Code example #35
0
            '.wt1100.fasta.ref.vcf'
        ],
                   shell=True)  #create .evo1 .evo2 .evo3
        try:
            check_call([
                'python evoevalues.production.py ' + sys.argv[1] +
                '.wt1100.fasta.ref.vcf'
            ],
                       shell=True)  #create .evo.evalues
            dataevoe = pd.read_csv(sys.argv[1] +
                                   '.wt1100.fasta.ref.vcf.evo.evalues',
                                   delimiter=',',
                                   header=None)
            dataevoe[0] = 'chr' + dataevoe[0].astype(str)
            matchedinds = pd.match(
                np.asarray(coordata['chr'].astype(str) +
                           coordata['pos'].astype(str)),
                np.asarray(dataevoe[0].astype(str) + dataevoe[1].astype(str)))
            dataevoe = np.asarray(dataevoe.iloc[:, -4:])
            dataevoe = dataevoe[matchedinds, :]
            #impute evolutionary feature E-values for the rare cases where evolutionary features are not available
            dataevoe[matchedinds == -1, :] = np.asarray([1, 1, 1,
                                                         1])[np.newaxis, :]
            datadeepsea = np.exp(
                np.mean(np.log(datae), axis=1) +
                np.mean(np.log(dataevoe), axis=1))
        except:
            datadeepsea = np.exp(np.mean(np.log(datae), axis=1))

        temp = pd.DataFrame(datadeepsea[:, np.newaxis])
        temp.columns = ['Functional significance score']
        datadeepsea = pd.concat([coordata, temp], axis=1)
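The composite-key lookup with -1 imputation in this excerpt maps naturally onto Index.get_indexer; a sketch, assuming each chr+pos key occurs only once in the E-value file (get_indexer requires a unique index):

# Same lookup as the pd.match call above: -1 marks coordinates with no
# evolutionary E-values, and those rows are then imputed with 1.0.
left_keys = coordata['chr'].astype(str) + coordata['pos'].astype(str)
right_keys = pd.Index(dataevoe[0].astype(str) + dataevoe[1].astype(str))
matchedinds = right_keys.get_indexer(left_keys)
evo = np.asarray(dataevoe.iloc[:, -4:])[matchedinds, :]
evo[matchedinds == -1, :] = 1.0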
Code example #36
0
def testCluster(vdb,
                featureSubset=FEATURES,
                C=0.82,
                gamma=9e-04,
                kernel='linear',
                th=.34):
    """
	Inference on test data.
	"""
    newWordList = pd.DataFrame()
    fitting = trainingVectors
    validation = test[test.db == vdb].copy()
    X = fitting[featureSubset].values
    y = fitting.target.values
    svClf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
    svClf.fit(X, y)
    svScores = svClf.predict_proba(validation[featureSubset].values)[:, 1]
    validation['svScores'] = svScores
    scores = pd.DataFrame()
    wordlist = pd.DataFrame()
    concepts = validation.gloss.unique()
    taxa = unique(validation[['l1', 'l2']].values.flatten())
    dataWordlist = vstack([
        validation[['gloss', 'l1', 'w1', 'cc1']].values,
        validation[['gloss', 'l2', 'w2', 'cc2']].values
    ])
    dataWordlist = pd.DataFrame(
        dataWordlist, columns=['concept', 'doculect', 'counterpart', 'cc'])
    dataWordlist = dataWordlist.drop_duplicates()
    dataWordlist.index = [
        '_'.join(map(unicode, x))
        for x in dataWordlist[['concept', 'doculect', 'counterpart']].values
    ]
    validation['id_1'] = [
        c + '_' + l + '_' + unicode(w)
        for (c, l, w) in validation[['gloss', 'l1', 'w1']].values
    ]
    validation['id_2'] = [
        c + '_' + l + '_' + unicode(w)
        for (c, l, w) in validation[['gloss', 'l2', 'w2']].values
    ]
    for c in concepts:
        dataC = validation[validation.gloss == c].copy()
        dataC['id_1'] = [
            x.replace(' ', '').replace(',', '') for x in dataC.id_1
        ]
        dataC['id_2'] = [
            x.replace(' ', '').replace(',', '') for x in dataC.id_2
        ]
        wlC = dataWordlist[dataWordlist.concept == c].copy()
        if len(wlC) > 1:
            wlC.index = [
                x.replace(' ', '').replace(',', '') for x in wlC.index
            ]
            svMtx = zeros((len(wlC.index), len(wlC.index)))
            svMtx[pd.match(dataC.id_1, wlC.index),
                  pd.match(dataC.id_2, wlC.index)] = dataC.svScores.values
            svMtx[pd.match(dataC.id_2, wlC.index),
                  pd.match(dataC.id_1, wlC.index)] = dataC.svScores.values
            svDistMtx = log(1 - svMtx)
            tth = log(th) - svDistMtx.min()
            svDistMtx -= svDistMtx.min()
            fill_diagonal(svDistMtx, 0)
            pDict = infomap_clustering(tth, svDistMtx)
            pArray = vstack(
                [c_[pDict[k], [k] * len(pDict[k])] for k in pDict.keys()])
            partitionIM = pArray[argsort(pArray[:, 0]), 1]
        else:
            partitionIM = array([1])
        wlC['inferredCC'] = [vdb + ':' + c + ':' + str(x) for x in partitionIM]
        wlC['db'] = vdb
        newWordList = pd.concat([newWordList, wlC])
    newWordList.index = arange(len(newWordList))
    return newWordList
Code example #37
0
from numpy import *
import pandas as pd

# this script computes Cronbach's alpha for all languages in the sample

data = pd.read_csv('conceptwiseSimilarities.csv', index_col=0)

concepts = array(data.columns[-40:])

taxa = unique(data[['language1', 'language2']].values.flatten())

nrMts = []
for c in concepts:
    cMtx = zeros((len(taxa), len(taxa)))
    ix1 = list(pd.match(data.language1, taxa))
    ix2 = list(pd.match(data.language2, taxa))
    cMtx[ix1, ix2] = data[c].values
    cMtx[ix2, ix1] = data[c].values
    nrMts.append(cMtx)

matrices = zeros((len(taxa), len(taxa), len(nrMts)))

for c in xrange(40):
    matrices[:, :, c] = nrMts[c]


def cronbach(x):
    itemwise = sum(apply_along_axis(var, 0, x))
    total = var(apply_along_axis(sum, 1, x))
    return 1. * len(x) / (len(x) - 1) * (1 - itemwise / total)
Code example #38
0
#https://stackoverflow.com/questions/36063251/python-pandas-how-can-i-group-by-and-assign-an-id-to-all-the-items-in-a-group
df["b"] = LabelEncoder().fit_transform(df['g'])     #int count from 0
#https://stackoverflow.com/questions/41594703/pandas-assign-an-index-to-each-group-identified-by-groupby
df['b'] = pd.Categorical(df['a'].astype(str)).codes
df['b'] = pd.Categorical(df['a'].astype(str) + df['c'].astype(str)).codes #allow multiple col groups



#R: ind = order(v)
y = np.argsort(v)
y = v.argsort()

#R: match(v1, vdict) -> vdict[match(v1,vdict)] gives v1
np.searchsorted(vdict,v1) #if vdict is sorted
vdict[np.searchsorted(vdict,v1)] #gives v1
pd.match([1,2,3,5,8,2],[1,2,4,5,9,2])
#R: match(c(1,2,3,5,8,2),c(1,2,4,5,9,2))  #same result as pd.match above
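#note: pd.match was removed in later pandas; Index.get_indexer is the closest
#replacement (0-based positions, -1 where there is no match), but it needs a
#unique lookup index -- a sketch with the duplicate 2 dropped from vdict:
pd.Index([1,2,4,5,9]).get_indexer([1,2,3,5,8,2])   #array([ 0,  1, -1,  3, -1,  1])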

#R: d[order(v),]
d.reindex(np.argsort(d['c'])).reset_index(drop=True)
#R: setcolorder(d,new_col_order)
#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
#d.reindex_axis(['a','b','c'], axis=1) #deprecated
d.reindex(['a','b','c'], axis=1)  #copy all data

d.sort_values(["a","b"], ascending = [True,False], inplace=False)
d.sort_values("a", ascending = True) #inplace is False default

s.sort_values() #series no need to add input
d['b'] = d.a.sort_values()  # wrong: assignment re-aligns to d's index, undoing the sort
d['b'] = d.a.sort_values().values  #correct
Code example #39
0
for i in range(1,sheet.nrows):
    if sheet.cell_value(i,0) != '':
        date_value = sheet.cell_value(i,0)
        ticker = sheet.cell_value(i,2)
        dt = datetime.datetime(*xlrd.xldate_as_tuple(date_value,book.datemode))
        row_index.append(i)
        dates.append(dt.strftime("%Y%m%d"))
        tickers.append(ticker)

df = pd.DataFrame({'Date':pd.Series(dates, index=row_index),'Ticker':pd.Series(tickers, index=row_index)})
df['Ticker'] = df['Ticker'].apply(lambda x: x.replace('-',' '))
                                  
#Find Unique Tickers and Get Exchange Info from Yahoo!
unique_tickers = pd.unique(df['Ticker']).tolist()
ref_position = pd.match(df['Ticker'].tolist(),unique_tickers).tolist()
unique_tickers = [i.replace(' ','-') for i in unique_tickers]

print('[Reading Exchange Information from Yahoo! Finance]')
exchange_info = []

for i in range(len(unique_tickers)/query_limit):
    print('Reading ' + str((i+1)*query_limit))
    query_url = 'http://download.finance.yahoo.com/d/quotes.csv?s=' + '+'.join(unique_tickers[i*query_limit:(i+1)*query_limit]) + '&f=x'
    if len(exchange_info) == 0:
        exchange_info = pd.read_csv(query_url,header=None).iloc[:,0].tolist()
    else:
        exchange_info.extend(pd.read_csv(query_url,header=None).iloc[:,0].tolist())

if len(unique_tickers)%query_limit > 0:
    print('Reading ' + str(len(unique_tickers)))
Code example #40
0
        index_end = int(np.minimum(
            (args.splitIndex) * np.ceil(float(h5f.shape[0] / 2) / args.splitFold), (h5f.shape[0] / 2)))
    else:
        index_start = 0
        index_end = int(h5f.shape[0] / 2)

    snp_temp = (np.asarray(h5f[index_start:index_end,:])+ np.asarray(h5f[index_start+int(h5f.shape[0]/2):index_end+int(h5f.shape[0]/2),:]))/2.0
    snpEffects.append(snp_temp)


coor = pd.read_csv(args.coorFile,sep='\t',header=None)
coor = coor.iloc[index_start:index_end,:]

#Fetch the distance to TSS information
gene = pd.read_csv(args.geneFile,sep='\t',header=None)
geneinds = pd.match(coor.iloc[:,0].map(str).str.replace('chr','')+' '+coor.iloc[:,1].map(str),
            gene.iloc[:,0].map(str).str.replace('chr','')+' '+gene.iloc[:,2].map(str))
if np.any(geneinds==-1):
    raise ValueError("Gene association file does not match the vcf file.")
if args.fixeddist == 0:
    dist = - np.asarray(gene.iloc[geneinds,-1])
else:
    dist = args.fixeddist
genename = np.asarray(gene.iloc[geneinds,-2])
strand= np.asarray(gene.iloc[geneinds,-3])

#compute expression effects
snpExpEffects = compute_effects(snpEffects, \
                                dist, strand,\
                                models, maxshift=maxshift, nfeatures=args.nfeatures,
                                batchSize = args.batchSize)
#write output
Code example #41
0
 def time_match_strings(self):
     pd.match(self.all, self.uniques)
Code example #42
0
File: algorithms.py Project: Winterflower/pandas
 def time_match_strings(self):
     pd.match(self.all, self.uniques)
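Both excerpts look like methods from pandas' asv benchmark suite; a hedged sketch of the kind of class they would sit in, with an invented setup fixture (not the actual benchmark data):

import numpy as np
import pandas as pd

class MatchStrings(object):
    # Hypothetical fixture; the real benchmark's setup is not shown in the excerpts.
    def setup(self):
        self.uniques = np.array(['s%05d' % i for i in range(1000)], dtype=object)
        self.all = np.tile(self.uniques, 10)

    def time_match_strings(self):
        pd.match(self.all, self.uniques)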
Code example #43
0
  
 fl = pd.read_csv(reportPath + fl)
 fl = fl.replace(np.nan,0).replace('',0)
 report = fl.reset_index(drop = True)
  
 climb = climb.replace(np.nan, 0).replace('',0)
 ## apply designated filters:
 climb = climb[ climb.vols1825 <= 0.7]
 climb = climb[ climb.rollingSTD730 <= 0.2]
  
 #climb = climb[ climb['skew'] > 0]
 #climb = climb[ climb.avgRectractAsGainPercent_180 <= 80]
 climb = pd.merge(climb,coins, on = 'ticker')
  
 for periodCol in ['180']:
     climb['change'] = ~(climb['climbOrRetract_' + periodCol] == report['climbOrRetract_' + periodCol][pd.match(climb.ticker,report.ticker)].values)
     changeReport = climb[['ticker','latestPrice','climbOrRetract_' + periodCol,'avgRectractAsGainPercent_' + periodCol,'currentAsPercentOfPrevious_' + periodCol,'targetFib_'+ periodCol,
                              'target_'+ periodCol,'targetGain_' + periodCol,'avgDaysClimbing_' + periodCol,'avgDaysRetracting_' + periodCol,
                              'gainFromMin_1_2','gainFromMin_2_4','gainFromMin_5_7','change']]
     changeReport = changeReport[ changeReport.change == True].reset_index(drop = True)
      
     if changeReport.shape[0] > 0:
         changeReport.to_csv('C:\\Users\\Nick\\Documents\\project MONEY\\Output Reports\\crypto\\daily\\changeReport_daily' + periodCol + '_' + dte + '.csv', sep =',', index = False)
          
         splitLen = int(np.ceil(changeReport.ticker.unique().shape[0] / 3))
          
         msgImageData = []
            
         n = 0
         for i in range(splitLen):
             if n < changeReport.ticker.unique().shape[0]:
Code example #44
0
                  ]]
 meta2 = pd.merge(wordlist[[
     'global_id', 'language', 'gloss', 'synonym_number', 'transcription',
     'cognate_class'
 ]],
                  meta1,
                  left_on=['global_id', 'language', 'synonym_number'],
                  right_on=['global_id', 'l2', 'id2'])[[
                      'sample_id', 'gloss', 'l1', 'transcription_y',
                      'cognate_class_y', 'l2', 'transcription_x',
                      'cognate_class_x'
                  ]]
 meta2.columns = [
     'sample_id', u'gloss', 'l1', u'w1', u'cc1', 'l2', u'w2', u'cc2'
 ]
 meta2 = meta2.ix[pd.match(vectors.sample_id, meta2.sample_id)]
 concepts = meta2.gloss.unique()
 feature7 = pd.Series([
     abs(
         corrcoef(
             array(
                 vectors[meta2.gloss == c][['feature2', 'feature4']].values,
                 double).T)[0, 1]) for c in concepts
 ],
                      index=concepts,
                      dtype=double)
 feature7[feature7.isnull()] = 0
 vectors['feature7'] = feature7.ix[meta2.gloss.values].values
 combined = pd.merge(pd.merge(meta2, vectors, on='sample_id'),
                     labels,
                     on='sample_id')