Example #1
def getSJSuggestion(jobPoolLocal):
    global userDF, appDF, jobDF, jobPool, liveJobDF, liveJobDict, LiveJobs, JobUserSparseMatrix, LiveJobUserSparseMatrix
    jobPool = jobPoolLocal
    st = datetime.now()
    userDF = pd.DataFrame({'userId': appDF['userId'].unique()})
    appDF['userLookUp'] = pd.match(appDF['userId'], userDF['userId'])
    appDF['jobLookUp'] = pd.match(appDF['jobId'], jobDF['jobId'])
    appDF['liveJobLookUp'] = pd.match(appDF['jobId'], liveJobDF['jobId'])
    row = appDF['jobLookUp']
    col = appDF['userLookUp']
    data = np.repeat(1,appDF.shape[0])
    JobUserSparseMatrix = sp.coo_matrix((np.array(data), (np.array(row),np.array(col))), shape=(jobDF.shape[0], userDF.shape[0])) # 30L x 5L
    del row, col, data
    JobUserSparseMatrix = JobUserSparseMatrix.tocsr()
    row = appDF[~(appDF.liveJobLookUp == -1)]['liveJobLookUp']
    col = appDF[~(appDF.liveJobLookUp == -1)]['userLookUp']
    data = np.repeat(1,appDF[~(appDF.liveJobLookUp == -1)].shape[0])
    LiveJobUserSparseMatrix = sp.coo_matrix((np.array(data), (np.array(row),np.array(col))), shape=(liveJobDF.shape[0], userDF.shape[0])) # 30L x 1L
    del row, col, data
    LiveJobUserSparseMatrix = LiveJobUserSparseMatrix.tocsr()
    liveJobDict = dict(zip(liveJobDF.index, liveJobDF['jobId']))
    collectionJS.drop()
    collectionJSExport.drop()
    print 'JobSuggestions Count:', collectionJS.count(), 'JobSuggestionsExport Count:', collectionJSExport.count() 
    st1 = datetime.now()
    for jobPosition in range(0, JobUserSparseMatrix.shape[0], jobPool):
        getSJSuggestionPoolWise(jobPosition)
    print datetime.now() - st1
    print 'JobSuggestions Count:', collectionJS.count(), 'JobSuggestionsExport Count:', collectionJSExport.count()
    print "Run time :" + str(datetime.now() - st)  # 10 min
Example #2
def balanceFactors(data, sep, cost, factors, constraints, model):
    its = 0
    cnvg = 1
    while cnvg > .0001:
        its = its + 1
        if model != 'attConstrained':
            calcAi(data, sep, cost, factors, model)
            AiBF = (data.groupby(data[constraints['production']].name).aggregate({"Ai": np.sum}))
            AiBF["Ai"] = 1/AiBF["Ai"]
            updates = AiBF.ix[pd.match(data[constraints['production']], AiBF.index), "Ai"]
            data["Ai"] = updates.reset_index(level=0, drop=True) if(updates.notnull().any()) else data["Ai"]
            if model == 'prodConstrained':
                break
            if its == 1:
                data["OldAi"] = data["Ai"]
            else:
                data["diff"] = abs((data["OldAi"] - data["Ai"])/data["OldAi"])
                data["OldAi"] = data["Ai"]

        if model != 'prodConstrained':
            calcBj(data, sep, cost, factors, model)
            BjBF = data.groupby(data[constraints['attraction']].name).aggregate({"Bj": np.sum})
            BjBF["Bj"] = 1/BjBF["Bj"]
            updates = BjBF.ix[pd.match(data[constraints['attraction']], BjBF.index), "Bj"]
            data["Bj"] = updates.reset_index(level=0, drop=True) if(updates.notnull().any()) else data["Bj"]
            if its == 1:
                if model == 'attConstrained':
                    break
                data["OldBj"] = data["Bj"]
            else:
                data["diff"] = abs((data["OldBj"] - data["Bj"])/data["OldBj"])
                data["OldBj"] = data["Bj"]
        cnvg = np.sum(data["diff"])
        #print cnvg, its
    return data
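The AiBF.ix[pd.match(...)] pattern above broadcasts a per-group sum back onto every row. On current pandas, where both .ix and pd.match are gone, the same alignment can be written with groupby(...).transform. A small hedged sketch with a made-up 'origin' column standing in for data[constraints['production']]:

import pandas as pd

data = pd.DataFrame({'origin': ['a', 'a', 'b'], 'Ai': [0.2, 0.3, 0.5]})
# 1 / (group sum of Ai), already aligned to the original rows
data['Ai'] = 1.0 / data.groupby('origin')['Ai'].transform('sum')
print(data)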
Example #3
    def _load_data(self, x1, x2):
        final_data = np.zeros((x1.num_row, x2.num_col))
        final_row_labels = x1.row_labels
        final_col_labels = x2.col_labels

        prior_data = np.genfromtxt(self.data_file)
        prior_row_labels = self.prior_row_labels.tolist()
        assert prior_data.shape[0] == len(prior_row_labels)
        prior_col_labels = self.prior_col_labels.tolist()
        assert prior_data.shape[1] == len(prior_col_labels)

        prior_row_match_ind = pd.match(prior_row_labels, final_row_labels)
        prior_rows_to_transfer = [
            el for el in range(len(prior_row_labels))
            if prior_row_match_ind[el] != -1
        ]
        final_rows_to_fill = prior_row_match_ind[prior_row_match_ind != -1]
        prior_col_match_ind = pd.match(prior_col_labels, final_col_labels)
        prior_cols_to_transfer = [
            el for el in range(len(prior_col_labels))
            if prior_col_match_ind[el] != -1
        ]
        final_cols_to_fill = prior_col_match_ind[prior_col_match_ind != -1]
        final_data[np.ix_(final_rows_to_fill, final_cols_to_fill)] = \
            prior_data[prior_rows_to_transfer,:][:,prior_cols_to_transfer]

        self.row_labels = final_row_labels
        self.col_labels = final_col_labels

        self.data = final_data
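The two pd.match calls above align the prior matrix with the new row and column labels, leaving unmatched cells at zero. A hedged alternative, sketched on toy labels rather than the class's real data, is DataFrame.reindex, which performs the same alignment in one step:

import pandas as pd

prior = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
                     index=['g1', 'g2'], columns=['c1', 'c2'])
# Align to the final label order; labels absent from the prior become 0.0.
final = prior.reindex(index=['g2', 'g3'], columns=['c2', 'c1'], fill_value=0.0)
print(final.values)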
Example #4
def category_transformation(train_categoric,
                            test_categoric,
                            labels,
                            type='std'):

    if type == 'freq':
        print("Encoding categories by freqency rank...")
        for c in train_categoric.columns:
            freqs = train_categoric[c].append(test_categoric[c]).value_counts()
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          freqs[0:91].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         freqs[0:91].index)

    if type == 'std':
        print("Encoding categories by sklearn label encoder...")
        for c in train_categoric.columns:
            lbl = LabelEncoder()
            lbl.fit(
                list(train_categoric.ix[:, c]) + list(test_categoric.ix[:, c]))
            train_categoric.ix[:, c] = lbl.transform(train_categoric.ix[:, c])
            test_categoric.ix[:, c] = lbl.transform(test_categoric.ix[:, c])

    if type == 'tgtrate':
        print("Encoding categories by target rate...")
        for c in train_categoric.columns:
            train_categoric[c], test_categoric[c] = category_to_prob_weight(
                train_categoric, test_categoric, c, labels)

    if type == 'rank':
        print("Encoding categories by rank transformation...")
        for c in train_categoric.columns:
            rank = pd.concat([train_categoric[c], labels],
                             axis=1).groupby(c).mean().sort_values(
                                 by='target', ascending=False)
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          rank[0:20000].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         rank[0:20000].index)

    if type == 'onehot':
        print("One hot... ")
        for c in train_categoric.columns:
            uniques = np.unique(train_categoric[c])
            if len(uniques) > 100:
                train_categoric.drop(c, axis=1, inplace=True)
                test_categoric.drop(c, axis=1, inplace=True)
        x_cat_train = train_categoric.T.to_dict().values()
        x_cat_test = test_categoric.T.to_dict().values()

        # vectorize
        vectorizer = DV(sparse=False)
        train_categoric = pd.DataFrame(vectorizer.fit_transform(x_cat_train))
        test_categoric = pd.DataFrame(vectorizer.transform(x_cat_test))

    return train_categoric, test_categoric
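In the 'freq' branch above, pd.match maps each category to its frequency rank and anything outside the top N to -1. A minimal sketch of the same encoding on current pandas (toy columns, hypothetical top_n cutoff), using pd.concat and Index.get_indexer in place of Series.append and pd.match, both of which have since been removed:

import pandas as pd

train_col = pd.Series(['a', 'b', 'a', 'c'])
test_col = pd.Series(['b', 'd'])
top_n = 2  # hypothetical cutoff, playing the role of the 91 above

freqs = pd.concat([train_col, test_col]).value_counts()
top = pd.Index(freqs.index[:top_n])
train_enc = top.get_indexer(train_col)  # frequency rank, -1 outside the top N
test_enc = top.get_indexer(test_col)
print(train_enc, test_enc)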
Example #5
def balanceFactors(data, sep, cost, factors, constraints, model):
    """
    calculate balancing factors and balance the balancing factors if doubly constrained model
    """
    its = 0
    cnvg = 1
    while cnvg > .0001:
        its = its + 1
        #If model is prod or doubly constrained
        if model != 'attConstrained':
            calcAi(data, sep, cost, factors, model)
            AiBF = (data.groupby(
                data[constraints['production']].name).aggregate({"Ai":
                                                                 np.sum}))
            AiBF["Ai"] = 1 / AiBF["Ai"]
            updates = AiBF.ix[
                pd.match(data[constraints['production']], AiBF.index), "Ai"]
            data["Ai"] = updates.reset_index(level=0, drop=True) if (
                updates.notnull().any()) else data["Ai"]
            #If model is prod constrained stop here - dont need to balance
            if model == 'prodConstrained':
                break
            if its == 1:
                data["OldAi"] = data["Ai"]
            else:
                data["diff"] = abs(
                    (data["OldAi"] - data["Ai"]) / data["OldAi"])
                data["OldAi"] = data["Ai"]
        #If model is att or doubly constrained
        if model != 'prodConstrained':
            calcBj(data, sep, cost, factors, model)
            BjBF = data.groupby(
                data[constraints['attraction']].name).aggregate({"Bj": np.sum})
            BjBF["Bj"] = 1 / BjBF["Bj"]
            updates = BjBF.ix[
                pd.match(data[constraints['attraction']], BjBF.index), "Bj"]
            data["Bj"] = updates.reset_index(level=0, drop=True) if (
                updates.notnull().any()) else data["Bj"]
            if its == 1:
                #If model is att constrained stop here - dont need to balance
                if model == 'attConstrained':
                    break
                data["OldBj"] = data["Bj"]
            else:
                data["diff"] = abs(
                    (data["OldBj"] - data["Bj"]) / data["OldBj"])
                data["OldBj"] = data["Bj"]
        cnvg = np.sum(data["diff"])
        #print cnvg, its
    return data
Example #6
def category_transformation(train_categoric, test_categoric, labels, type='std'):

	if type == 'freq':
		print("Encoding categories by freqency rank...")
		for c in train_categoric.columns:
			freqs = train_categoric[c].append(test_categoric[c]).value_counts()
			train_categoric[c] = pd.match(train_categoric[c].values, freqs[0:1000].index)
			test_categoric[c] = pd.match(test_categoric[c].values, freqs[0:1000].index)

	if type == 'tgtrate':
		print("Encoding categories by target rate...")
		for c in train_categoric.columns:
			train_categoric[c], test_categoric[c] = category_to_prob_weight(train_categoric, test_categoric, c, labels)

	return train_categoric, test_categoric
Example #7
def get_highest_reviews(bid):
    other_reviewers_of_restaurant=list(yelp['user_id'][yelp['business_id']==bid])
    uid=random.sample(other_reviewers_of_restaurant, 1)
    user_column=pandas.match(uid, users)[0]
    similarity_indices_for_user=list(df.ix[:,user_column])
    z=numpy.array(similarity_indices_for_user)
    most_similar_users=numpy.argsort(z)[0:10]
    most_similar_users=[users[most_similar_users[i]] for i in range(10)]
    name_rest=yelp['name.business'][yelp['business_id']==bid].unique()[0]
    f =  lambda row: row['user_id'] in most_similar_users and row['name.business'] in name_rest
    k = yelp.apply(f, axis=1)
    temp=yelp[k]
    temp.iloc[:,[1,3,6,9,11,17,21,27]]
    x1=list(temp['stars.review']); x2=list(temp['richness']); x3=list(temp['fans'])
    x4=list(temp['review_count.review']); x5=list(temp['stars.business'])
    predicted_values=list()
    for i in range(len(temp)):
        predicted_values.append([predict_expected_value(x1[i],x2[i],x3[i],x4[i],x5[i]), i])
    predicted_values.sort(key=lambda x: x[0])
    predicted_values=predicted_values[-3:]
    predicted_values.sort(key=lambda x: x[1])
    reviews=list()
    for value in range(len(predicted_values)):
        row_of_review=predicted_values[value][1]
        print(row_of_review)
        reviews.append(temp['text'].iloc[row_of_review])
    return reviews
Example #8
def total_flows(dt, f, locs):
    """
    sum rows or columns to derive total inflows or total outflows
    """

    totals = dt.groupby(locs).aggregate({f: np.sum})
    return totals.ix[pd.match(locs, totals.index.astype(str))].reset_index()[f]
Example #9
def explain_prediction(bst, explainer, data):
    """

    :param bst:
    :type bst: xgb.Booster
    :param explainer:
    :type explainer: pd.DataFrame
    :param data:
    :return:
    """
    nodes = bst.predict(data, pred_leaf=True)
    colnames = list(explainer.columns.values)[:-2]

    preds_breakdown = pd.DataFrame(np.zeros((nodes.shape[0], len(colnames))),
                                   columns=colnames)

    print("Extracting the breakdown of each prediction...")
    num_trees = nodes.shape[1]
    with click.progressbar(range(num_trees), num_trees) as bar:
        for idx in bar:
            nodes_for_tree = nodes[:, idx]
            tree_breakdown = explainer[explainer["tree"] == idx].fillna(0)
            preds_breakdown_for_tree = tree_breakdown.loc[
                pd.match(nodes_for_tree, tree_breakdown["leaf"])][colnames] \
                .reset_index(drop=True)
            preds_breakdown = preds_breakdown + preds_breakdown_for_tree
    print("DONE!")
    return preds_breakdown
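Here pd.match selects, for every predicted leaf, the matching row of the per-tree breakdown. A hedged sketch of the same lookup without pd.match, on a toy frame rather than real xgboost output and assuming leaf ids are unique within a tree: index the breakdown by 'leaf' and reindex it with the predicted leaves.

import pandas as pd

tree_breakdown = pd.DataFrame({'leaf': [3, 5, 6], 'f0': [0.1, -0.2, 0.4]})
nodes_for_tree = [5, 5, 3, 6]  # toy predicted leaf per sample

picked = (tree_breakdown.set_index('leaf')
          .reindex(nodes_for_tree)   # one row per prediction, NaN if unseen
          .reset_index(drop=True))
print(picked)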
Example #10
def parse_usearch_allpairs(filename, seqnames):
    """Read output of ``usearch -allpairs_global -blast6out`` and return a
    square distance matrix. ``seqnames`` determines the marginal order
    of sequences in the matrix.

    """

    data = pd.read_table(filename, header=None, names=BLAST6NAMES)
    data['dist'] = pd.Series(
        1.0 - data['pct_id'] / 100.0, index=data.index)

    # for each sequence pair, select the longest alignment if there is
    # more than one (chooses first occurrence if there are two the same
    # length).
    maxidx = data.groupby(['query', 'target']).apply(
        lambda x: x['align_len'].idxmax())
    data = data.iloc[maxidx]

    if set(seqnames) != set(data['query']) | set(data['target']):
        # shutil.copy(filename, '.')
        raise UsearchError(
            'some sequences are missing from the output ({})'.format(filename))

    nseqs = len(seqnames)
    distmat = numpy.repeat(0.0, nseqs ** 2)
    distmat.shape = (nseqs, nseqs)
    ii = pd.match(data['query'], seqnames)
    jj = pd.match(data['target'], seqnames)

    # usearch_allpairs_files returns comparisons corresponding to a
    # triangular matrix, whereas vsearch_allpairs_files returns all
    # comparisons. Here we convert both to a square matrix.
    if data.shape[0] == nseqs * nseqs:
        distmat[ii, jj] = data['dist']
    elif data.shape[0] == (nseqs * (nseqs - 1)) / 2:
        distmat[ii, jj] = data['dist']
        distmat[jj, ii] = data['dist']
    else:
        msg = 'not all pairwise comparisons are represented ({})'
        raise UsearchError(msg.format(filename))

    return distmat
Example #12
def getUserAppsDF():
    global jobDF, userDF, userAppsDF, userAppsDict, UserAppsSparseMatrix
    userAppsDF = pd.read_csv(projectHomeSJ +
                             "/Input/ApplicationData_sorted.csv",
                             names=['userId', 'jobId'])
    userAppsDF = userAppsDF[userAppsDF.jobId.isin(jobDF.job)]
    userDF = pd.DataFrame({'userId': userAppsDF.userId.unique()})
    userAppsDF['userLookUp'] = pd.match(userAppsDF['userId'], userDF['userId'])
    userAppsDF['jobLookUp'] = pd.match(userAppsDF['jobId'], jobDF['job'])
    userAppsDF_1 = pd.DataFrame(
        list(collectionCA.find({}, {
            '_id': 1,
            'userApps': 1
        })))
    userAppsDict = dict(zip(userAppsDF_1['_id'], userAppsDF_1['userApps']))
    del userAppsDF_1
    UserAppsSparseMatrix = sp.coo_matrix(
        (np.repeat(1, userAppsDF.shape[0]),
         (userAppsDF['userLookUp'], userAppsDF['jobLookUp'])),
        shape=(userDF.shape[0], jobDF.shape[0]))
    UserAppsSparseMatrix = UserAppsSparseMatrix.tocsr()
Example #13
def memo_test(events, selected_genes, groups, permutations=10000):
    groups_memo = [pandas.match(group, selected_genes) for group in groups]
    events_selected = events[selected_genes]
    sampler = switching.EventMatrixSampler(events_selected.astype(int),
                                           "gobbi")

    coverages = numpy.array(
        [events_selected[i].any(0).sum() for i in groups_memo])
    higher_coverage = numpy.zeros_like(coverages)

    for i in xrange(permutations):
        null_sample = sampler.sample()
        higher_coverage += (numpy.array(
            [null_sample[i].any(0).sum()
             for i in groups_memo]) >= coverages).astype(int)

    return (higher_coverage + 1.0) / (permutations + 1.0)
Example #14
    def calc_Bj(self, dt, d, of, p, dc=False):
        """
        calculate Bj balancing factor
        """
        Bj = self.calc_dcy(self.c, self.cf, p)

        if of:
            for fx in of:
                Bj *= of[fx]**p[fx]
        if not dc:
            dt['Bj'] = Bj
        else:
            dt['Bj'] = Bj*dt['Ai']*dt['Oi']
        Bj = (dt.groupby(d).aggregate({'Bj': np.sum}))
        Bj['Bj'] = 1/Bj['Bj']
        Bj = Bj.ix[pd.match(d, Bj.index), 'Bj']
        return Bj.reset_index(level=0, drop=True)
Example #15
    def calc_Ai(self, dt, o, df, p, dc=False):
        """
        calculate Ai balancing factor
        """
        Ai = self.calc_dcy(self.c, self.cf, p)

        if df:
            for fx in df:
                Ai *= df[fx]**p[fx]

        if not dc:
            dt['Ai'] = Ai
        else:
            dt['Ai'] = Ai*dt['Bj']*dt['Dj']
        Ai = (dt.groupby(o).aggregate({'Ai': np.sum}))
        Ai['Ai'] = 1/Ai['Ai']
        Ai = Ai.ix[pd.match(o, Ai.index), 'Ai']
        return Ai.reset_index(level=0, drop=True)
Example #16
def comp(vector):

    # Function that computes the distinct observations in a numeric vector.
    # It is based entirely on the "comp11" function from the BNPTSclust
    # package in R created by David Alejandro Martell Juarez
    #
    # IN:
    #
    # vector <- numeric vector.
    #
    # OUT:
    #
    # jstar <- variable that rearranges the input vector into a vector with only
    #          its unique values.
    # nstar <- frequency of each distinct observation in the input vector.
    # rstar <- number of distinct observations in the input vector.
    # gn    <- variable that indicates the group number to which every
    #          entry in the input vector belongs.

    n = len(vector)

    mat = vector[:, None] == vector

    jstar = np.repeat(False, n)

    led = np.repeat(False, n)

    for j in np.arange(0, n):
        if not led[j]:
            jstar[j] = True
            if j + 1 == n:
                break
            ji = np.arange(j + 1, n)
            tt = mat[ji, j] == True
            led[ji] = led[ji] | tt
        if all(np.delete(led, np.arange(0, j + 1))):
            break

    ystar = vector[jstar]
    nstar = np.apply_along_axis(np.sum, 0, mat[:, jstar])
    rstar = len(nstar)
    gn = pd.match(vector, ystar)

    return jstar, nstar, rstar, gn
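comp() labels every observation with its group of equal values; pd.factorize does the same in one call and, like pd.match(vector, ystar), numbers groups in order of first appearance. A short sketch on a toy vector:

import numpy as np
import pandas as pd

vector = np.array([5, 3, 5, 7, 3])
gn, ystar = pd.factorize(vector)  # group number per entry, distinct values in appearance order
nstar = np.bincount(gn)           # frequency of each distinct value
rstar = len(ystar)                # number of distinct values
print(gn, ystar, nstar, rstar)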
Example #17
            if s >= .6 : cn+=1
        if cn > 0:
            res.append([se_id[i],unip,tname,cn])
    return res

compound_target_mapping = []
for i,r in target_pairs.iterrows():
    if i%200 == 0 and i > 0:
        print i

    # target names/symbols
    t1 = r['TARGET_A']
    t2 = r['TARGET_B']

    # get corresponding uniprots
    unip1 = targets.iloc[pd.match([t1],targets['SYMBOL'])]['UNIPROT'].tolist()[0]
    unip2 = targets.iloc[pd.match([t2],targets['SYMBOL'])]['UNIPROT'].tolist()[0]

    # get twosides compounds that have >.6 similarity to
    # at least one target active in ChEMBL
    se_cp1 = get_se_compound(unip1,t1)
    se_cp2 = get_se_compound(unip2,t2)
    
    if len(se_cp1) > 0 and len(se_cp2) > 0:
        # compound_target_mapping.append(se_cp1+se_cp2)

        for v1 in se_cp1 : # for 1st target/stitch compounds association
            for v2 in se_cp2: # for 2nd target/stitch compounds association
                # if both the compound for the 1st target and the compound for the 2nd target
                # are in the twosides dataset 
                if v1[0] in SE_data['stitch_id1'] and v2[0] in SE_data['stitch_id2']: 
Example #18
 def test_match(self):
     with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
         pd.match([1, 2, 3], [1])
Example #19
 def test_match(self):
     with tm.assert_produces_warning(FutureWarning,
                                     check_stacklevel=False):
         pd.match([1, 2, 3], [1])
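These two tests only assert that calling pd.match raises a FutureWarning; the function was subsequently removed from pandas. A one-line sketch of the commonly suggested replacement, Index.get_indexer, which returns the position of each value in the target (-1 for no match):

import pandas as pd

print(pd.Index([1]).get_indexer([1, 2, 3]))  # [ 0 -1 -1]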
Example #20
def setup(data, trips, sep, cost, factors, constraints, prodCon, attCon,
          initialParams, Oi, Dj, totalFlows):
    """
    set up all initial variables and balancing factors for mle calibration
    """

    #The following setup is for within all models

    #There is always a beta parameter so set it to user's initial value and add to param list
    data['beta'] = initialParams['beta']
    params = ['beta']

    #This is the observed data for which we want to derive parameters
    if cost == 'exp':
        knowns = data[sep]
    elif cost == 'pow':
        knowns = np.log(data[sep])
    else:
        sys.exit(
            "The distance/cost function must be either 'pow' or 'exp'.")

    #For doubly constrained model
    if (prodCon == True) & (attCon == True):

        #Variables for constants and deriving them
        data["Bj"] = 1.0
        data["Ai"] = 1.0
        data["OldAi"] = 10.000000000
        data["OldBj"] = 10.000000000
        data["diff"] = abs((data["OldAi"] - data["Ai"]) / data["OldAi"])

        #Calc total outflows and inflows
        if Oi:
            data["Oi"] = data[Oi]
        else:
            Oi = data.groupby(data[constraints['production']]).aggregate(
                {trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']],
                                        Oi.index)].reset_index()[trips]

        if Dj:
            data["Dj"] = data[Dj]
        else:
            Dj = data.groupby(data[constraints['attraction']]).aggregate(
                {trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']],
                                        Dj.index)].reset_index()[trips]

    #For Production Constrained model
    if (prodCon == True) & (attCon == False):

        #Calc total outflows
        if factors == None:
            if not Dj:
                Dj = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Dj"] = Dj.ix[pd.match(
                    data[totalFlows],
                    Dj.index)].reset_index()[trips].sort_index()

            else:
                data["Dj"] = data[Dj]

        if not Oi:
            Oi = data.groupby(data[constraints['production']]).aggregate(
                {trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']],
                                        Oi.index)].reset_index()[trips]
        else:
            data['Oi'] = data[Oi]

    #For Attraction Constrained model
    if (prodCon == False) & (attCon == True):

        #Calc total inflows
        if factors == None:
            if not Oi:
                Oi = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Oi"] = Oi.ix[pd.match(data[totalFlows],
                                            Oi.index)].reset_index()[trips]
            else:
                data["Oi"] = data[Oi]
        if not Dj:
            Dj = data.groupby(data[constraints['attraction']]).aggregate(
                {trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']],
                                        Dj.index)].reset_index()[trips]
        else:
            data["Dj"] = data[Dj]

    #For Unconstrained Model
    if (prodCon == False) & (attCon == False):
        for factor in factors['origins']:
            #Include that information in the model
            knowns = knowns + np.log(data[factor])
            #Add to params list
            params.append(str(factor))
            #variable param vector
            data[str(factor) + 'Param'] = initialParams[factor]
        for factor in factors['destinations']:
            #Include that information in the model
            knowns = knowns + np.log(data[factor])
            #Add to params list
            params.append(str(factor))
            #variable param vector
            data[str(factor) + 'Param'] = initialParams[factor]

    #For all models besides unconstrained - is probably redundant and can be refactored

    #If there are additional factors we will include that observed data, add it to param list, and add a data vector for the param
    if factors != None:
        if attCon != False:
            for factor in factors['origins']:
                #Include that information in the model
                knowns = knowns + np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                data[str(factor) + 'Param'] = initialParams[factor]
        if prodCon != False:
            for factor in factors['destinations']:
                #Include that information in the model
                knowns = knowns + np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                data[str(factor) + 'Param'] = initialParams[factor]

    #Observed information is sum of trips multiplied by the log of known information
    observed = np.sum(data[trips] * knowns)

    #return observed info, data, known info, and params list
    return observed, data, knowns, params
Example #21
def load(m_params):
    num_features = m_params['n_features']
    minbin = m_params['minbin']
    getcached = m_params['getcached']
    codetest = m_params['codetest']

    trainfilename = 'train_' + str(num_features) + str(minbin) + '.h5'
    testfilename = 'test_' + str(num_features) + str(minbin) + '.h5'

    # Read HDF format file
    print("1a. Reading the train and test data...\n")
    if getcached and os.path.isfile('input/' + trainfilename):

        train = pd.read_hdf('input/' + trainfilename, 'train')
        test = pd.read_hdf('input/' + testfilename, 'test')
        labels = train['target']
        test_ids = test['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

        return train.values, labels.values, test.values, test_ids.values

    else:
        train = pd.read_hdf('input/train.h5', 'train')
        test = pd.read_hdf('input/test.h5', 'test')

        if codetest:
            train = train.ix[0:999, :]
            test = test.ix[0:999, :]

        labels = train['target']
        test_ids = test['ID']
        train_ids = train['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

        print(
            "1c. Breaking dataframe into numeric, object and date parts...\n")
        train_numeric = train.select_dtypes(include=['float64', 'int64'])
        test_numeric = test.select_dtypes(include=['float64', 'int64'])

        train_categoric = train.select_dtypes(include=['object'])
        test_categoric = test.select_dtypes(include=['object'])

        train_dates = train.select_dtypes(include=['datetime64[ns]'])
        test_dates = test.select_dtypes(include=['datetime64[ns]'])

        # Zip code engineering
        print("2. Zip code engineering...\n")
        train['VAR_0241'] = train['VAR_0241'].fillna(99999)
        test['VAR_0241'] = test['VAR_0241'].fillna(99999)
        train_zips = np.empty([train.shape[0], 7])
        test_zips = np.empty([test.shape[0], 7])
        try:
            zp = train['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            train_zips[:, 0] = zp.map(lambda x: x[:2]).astype('int32')
            train_zips[:, 1] = zp.map(lambda x: x[:1] + x[-1:]).astype('int32')
            train_zips[:, 2] = zp.map(lambda x: x[:3]).astype('int32')
            train_zips[:, 3] = zp.map(lambda x: x[1:3]).astype('int32')
            train_zips[:, 4] = zp.map(lambda x: x[1:4]).astype('int32')
            train_zips[:, 5] = zp.map(lambda x: x[2:4]).astype('int32')
            train_zips[:, 6] = zp.map(lambda x: x[3:5]).astype('int32')
            zp = test['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            test_zips[:, 0] = zp.map(lambda x: x[:2]).astype('int32')
            test_zips[:, 1] = zp.map(lambda x: x[:1] + x[-1:]).astype('int32')
            test_zips[:, 2] = zp.map(lambda x: x[:3]).astype('int32')
            test_zips[:, 3] = zp.map(lambda x: x[1:3]).astype('int32')
            test_zips[:, 4] = zp.map(lambda x: x[1:4]).astype('int32')
            test_zips[:, 5] = zp.map(lambda x: x[2:4]).astype('int32')
            test_zips[:, 6] = zp.map(lambda x: x[3:5]).astype('int32')

            zipcolumns = [
                'zip0', 'zip1', 'zip2', 'zip3', 'zip4', 'zip5', 'zip6'
            ]
            train_zips = pd.DataFrame(train_zips, columns=zipcolumns)
            test_zips = pd.DataFrame(test_zips, columns=zipcolumns)
        except:
            print('Zip codes cant be encoded')
            exit()

        # Deal with categorical data
        print("3. Categorical variable encoding... \n")
        for c in train_categoric.columns:
            freqs = train_categoric[c].append(test_categoric[c]).value_counts()
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          freqs[0:70].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         freqs[0:70].index)

        # Deal with categorical data
        print("4. Numeric Column Smoothing... \n")
        train_numeric = train_numeric.fillna(0)
        test_numeric = test_numeric.fillna(0)
        numeric_col_count = 0
        if minbin > 1:
            for c in train_numeric.columns:
                train_numeric[c], test_numeric[c] = bin(
                    train_numeric[c], test_numeric[c], labels, minbin)
                numeric_col_count += 1
                if not (numeric_col_count % 10):
                    print('Numeric Col Count: ', numeric_col_count)

        gc.collect()

        # Create new date transformations
        print('5. Create new date columns...\n')

        def tdtoint(td):
            if not pd.isnull(td):
                return td.astype('timedelta64[D]').astype(np.int32)
            else:
                return 0

        # Diffs between important dates
        for i in [
                'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                'VAR_0169', 'VAR_0178', 'VAR_0166'
        ]:
            for j in [
                    'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                    'VAR_0169', 'VAR_0178', 'VAR_0166'
            ]:
                if i < j:
                    keypair = i + '_' + j
                else:
                    keypair = j + '_' + i
                if i != j and keypair not in train_dates.columns:
                    train_dates[keypair] = train_dates[i] - train[j]
                    train_dates[keypair] = train_dates[keypair].apply(tdtoint)
                    test_dates[keypair] = test_dates[i] - test_dates[j]
                    test_dates[keypair] = test_dates[keypair].apply(tdtoint)

        # Date Splits
        datecols = pd.read_pickle('input/datecols.pkl')
        for c in datecols['col'].values.tolist():
            train_dates[c + '_y'] = train_dates[c].dt.year
            train_dates[c + '_m'] = train_dates[c].dt.month
            train_dates[c + '_d'] = train_dates[c].dt.day
            train_dates[c + '_wd'] = train_dates[c].dt.weekday
            train_dates[c + '_hr'] = train_dates[c].dt.hour
            test_dates[c + '_y'] = test_dates[c].dt.year
            test_dates[c + '_m'] = test_dates[c].dt.month
            test_dates[c + '_d'] = test_dates[c].dt.day
            test_dates[c + '_wd'] = test_dates[c].dt.weekday
            test_dates[c + '_hr'] = test_dates[c].dt.hour

        train_dates.drop(datecols['col'].values.tolist(), axis=1, inplace=True)
        test_dates.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

        gc.collect()

    print("5. Merging arrays together...\n")
    # put separate parts together again

    train = pd.concat(
        [train_categoric, train_dates, train_numeric, train_zips], axis=1)
    test = pd.concat([test_categoric, test_dates, test_numeric, test_zips],
                     axis=1)

    # Get only top n features
    print("1b. Filtering by pickled important columns...\n")
    cols = pd.read_pickle("input/vars_importance.pkl")
    cols = list(cols.ix[0:num_features, "var"])

    for c in cols:
        if c not in train.columns:
            cols.remove(c)

    train = train[cols].fillna(0)
    test = test[cols].fillna(0)

    gc.collect()

    try:
        print("6. Writing to hdf format...\n")
        pd.concat([train_ids, train, labels],
                  axis=1).to_hdf('input/' + trainfilename,
                                 key='train',
                                 format='fixed',
                                 mode='w')
        pd.concat([test_ids, test], axis=1).to_hdf('input/' + testfilename,
                                                   key='test',
                                                   format='fixed',
                                                   mode='w')
    except:
        error = sys.exc_info()[0]
        print("Error: %s" % error)

    return train.values, labels.values, test.values, test_ids.values
Example #22
sider_se = sider_se[sider_se[3] == 'PT']

#sider_fq = pd.read_table('/home/az338/ucc-fileserver/sider/meddra_freq.tsv',header=None)
#sider_fq = sider_fq.drop(0,1)


# concatenate sider and offsides side effect
sider_se = sider_se.drop([3,4],1)
sider_se.columns = ['stitch_id','umls_id','event']
allSE = pd.concat([sider_se,offsides[['stitch_id','umls_id','event']]])
allSE.to_csv(DATA_DIR+'sider_and_offsides_SE.csv')

# load stitch_id  smiles mapping dataset 
stitch_smiles = pd.read_table('/home/az338/ucc-fileserver/stitch_v4/chemicals.v4.0.tsv')


# match stitch id in SE dataset to smiles
idx_offsides = pd.match(offsides['stitch_id'],stitch_smiles['chemical'])
idx_offsides = idx_offsides[idx_offsides >= 0]

idx_sider = pd.match(sider_se['stitch_id'],stitch_smiles['chemical'])
idx_sider = idx_sider[idx_sider >= 0]

# concatenate sider and offsides structures
offsides_struct = stitch_smiles.iloc[idx_offsides][['chemical','SMILES_string']].drop_duplicates()
sider_struct = stitch_smiles.iloc[idx_sider][['chemical','SMILES_string']].drop_duplicates()
all_struct = pd.concat([offsides_struct,sider_struct]).drop_duplicates()
all_struct.to_csv(DATA_DIR+'sider_offsides_smiles.csv')


Example #23
import pandas as pd


DATA_DIR = '/home/az338/ucc-fileserver/AZ_challenge_data/'
SE_DATA_DIR = '/scratch/az338/ucc-fileserver/side_effects_data/'

# load twosides
SE_data = pd.read_table(SE_DATA_DIR+'3003377s-twosides.tsv')

# load compound structures 
SE_struct = pd.read_table('/home/az338/ucc-fileserver/stitch_v4/chemicals.v4.0.tsv')

# get unique compound identifiers in twosides
twosides_compounds = list(set(SE_data['stitch_id1'].values.tolist()+SE_data['stitch_id2'].values.tolist()))

# match these compounds to the OFFSIDES/SIDER structural dataset 
# to match the stitch IDs to their structure
idx = pd.match(twosides_compounds,SE_struct['chemical'])
twosides_struct = SE_struct.iloc[idx[idx >= 0]]
twosides_struct=twosides_struct.drop('molecular_weight',axis=1)

twosides_struct.to_csv(SE_DATA_DIR+'twosides_smiles.csv')
Example #24
 def time_match_string(self):
     with warnings.catch_warnings(record=True):
         pd.match(self.all, self.uniques)
Example #25
def load(m_params):

    num_features = m_params['n_features']
    minbin = m_params['minbin']
    getcached = m_params['getcached']

    t0 = time.time()

    trainfilename = 'train_' + str(num_features) + str(minbin) + '.h5'
    testfilename = 'test_' + str(num_features) + str(minbin) + '.h5'

    # Read HDF format file
    print("1. Reading the train and test data...\n")

    if getcached and os.path.isfile(trainfilename):

        train = pd.read_hdf(trainfilename, 'train')
        test = pd.read_hdf(testfilename, 'test')
        labels = train['target']
        test_ids = test['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

        return train.values, labels.values, test.values, test_ids.values

    elif getcached and os.path.isfile('train_binned_' + str(minbin) + '.h5'):

        train = pd.read_hdf('train_binned_' + str(minbin) + '.h5', 'train')
        test = pd.read_hdf('test_binned_' + str(minbin) + '.h5', 'test')
        labels = train['target']
        test_ids = test['ID']
        train.drop(['ID', 'target'], axis=1, inplace=True)
        test.drop(['ID'], axis=1, inplace=True)

    else:

        train = pd.read_hdf('train.h5', 'train')
        test = pd.read_hdf('test.h5', 'test')

        labels = train['target']
        test_ids = test['ID']

        gc.collect()

        print("Postcode column \n")
        print(train['VAR_0241'].dtype, len(np.unique(train['VAR_0241'])))
        print(test['VAR_0241'].dtype, len(np.unique(test['VAR_0241'])))

        # Zip code engineering
        print("4. Zip code engineering...\n")
        train['VAR_0241'] = train['VAR_0241'].fillna(99999)
        test['VAR_0241'] = test['VAR_0241'].fillna(99999)
        try:
            zp = train['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            train['zip_00xxx'] = zp.map(lambda x: x[:2]).astype('int32')
            train['zip_0xxx0'] = zp.map(lambda x: x[:1] + x[-1:]).astype(
                'int32')
            train['zip_000xx'] = zp.map(lambda x: x[:3]).astype('int32')
            train['zip_x00xx'] = zp.map(lambda x: x[1:3]).astype('int32')
            train['zip_x000x'] = zp.map(lambda x: x[1:4]).astype('int32')
            train['zip_xx00x'] = zp.map(lambda x: x[2:4]).astype('int32')
            train['zip_xxx00'] = zp.map(lambda x: x[3:5]).astype('int32')
            zp = test['VAR_0241'].astype('int64').astype(str)
            zp = zp.replace('', '99999')
            test['zip_00xxx'] = zp.map(lambda x: x[:2]).astype('int32')
            test['zip_0xxx0'] = zp.map(lambda x: x[:1] + x[-1:]).astype(
                'int32')
            test['zip_000xx'] = zp.map(lambda x: x[:3]).astype('int32')
            test['zip_x00xx'] = zp.map(lambda x: x[1:3]).astype('int32')
            test['zip_x000x'] = zp.map(lambda x: x[1:4]).astype('int32')
            test['zip_xx00x'] = zp.map(lambda x: x[2:4]).astype('int32')
            test['zip_xxx00'] = zp.map(lambda x: x[3:5]).astype('int32')
        except:
            print('BOLLOCKS Zip codes cant be encoded')
            exit()

        # Deal with categorical data and smoothing
        print(
            "2. Categorical variable encoding and numeric col smoothing... \n")
        numeric_col_count = 0
        for c in train.columns[1:-1]:
            if train[c].name != 'target':
                if train[c].dtype.name == 'object':
                    freqs = train[c].append(test[c]).value_counts()
                    train[c] = pd.match(train[c].values, freqs[0:70].index)
                    test[c] = pd.match(test[c].values, freqs[0:70].index)
                elif train[c].dtype.name in ['int64', 'float64'
                                             ] and minbin > 1:
                    # smooth numeric cols
                    train[c] = bin(train[c], train[c], train['target'], minbin)
                    test[c] = bin(test[c], train[c], train['target'], minbin)
                    numeric_col_count += 1
                    if not (numeric_col_count % 10):
                        print('Numeric Col Count: ', numeric_col_count)

        gc.collect()

        # Create new date transformations
        print('3. Create new date columns...\n')

        def tdtoint(td):
            if not pd.isnull(td):
                return td.astype('timedelta64[D]').astype(int)
            else:
                return 0

        # Diffs between important dates
        for i in [
                'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                'VAR_0169', 'VAR_0178', 'VAR_0166'
        ]:
            for j in [
                    'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217',
                    'VAR_0169', 'VAR_0178', 'VAR_0166'
            ]:
                if i < j:
                    keypair = i + '_' + j
                else:
                    keypair = j + '_' + i
                if i != j and keypair not in train.columns:
                    train[keypair] = train[i] - train[j]
                    train[keypair] = train[keypair].apply(tdtoint)
                    test[keypair] = test[i] - test[j]
                    test[keypair] = test[keypair].apply(tdtoint)

        # Date Splits
        datecols = pd.read_pickle('datecols.pkl')
        for c in datecols['col'].values.tolist():
            train[c + '_y'] = train[c].dt.year
            train[c + '_m'] = train[c].dt.month
            train[c + '_d'] = train[c].dt.day
            train[c + '_wd'] = train[c].dt.weekday
            train[c + '_hr'] = train[c].dt.hour
            test[c + '_y'] = test[c].dt.year
            test[c + '_m'] = test[c].dt.month
            test[c + '_d'] = test[c].dt.day
            test[c + '_wd'] = test[c].dt.weekday
            test[c + '_hr'] = test[c].dt.hour

        train.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

        gc.collect()

        # Fill any remaining N/As
        train = train.fillna(0)
        test = test.fillna(0)

        #print("4.5. Writing to hdf format...\n")
        #train.to_hdf('train_binned_' + str(minbin) + '.h5',key='train',format='fixed',mode='w')
        #test.to_hdf('test_binned_' + str(minbin) + '.h5',key='test',format='fixed',mode='w')

    # Get only top n features
    print("5. Filtering by pickled important columns...\n")
    cols = pd.read_pickle("vars_importance.pkl")
    cols = cols.ix[0:num_features, "var"].tolist()

    print("6. Writing to hdf format...\n")
    #zipcols = ['zip_00xxx', 'zip_0xxx0', 'zip_000xx', 'zip_x00xx', 'zip_x000x', 'zip_xx00x', 'zip_xxx00']
    zipcols = []
    train[cols + zipcols + ['ID', 'target']].to_hdf(trainfilename,
                                                    key='train',
                                                    format='fixed',
                                                    mode='w')
    test[cols + zipcols + ['ID']].to_hdf(testfilename,
                                         key='test',
                                         format='fixed',
                                         mode='w')

    train = train[cols + zipcols]
    test = test[cols + zipcols]

    gc.collect()

    return train.values, labels.values, test.values, test_ids.values
Example #26

path_var = 'E:\\Wojtek\\_DSCN_\\Analiza_danych\\Leas\\data_set\\'
var_df = pd.read_csv(path_var+'variables.csv', sep=';')
var_df.columns
var_df.type_var.value_counts()
var_df.head(10)
var_df.type_pred.value_counts(dropna=False)


#predictors = var_df.variable[ (var_df.type_var == 'pred') | (var_df.variable == target_name) ]
predictors = var_df.variable[ var_df.type_var == 'pred' ]
ind_target = data_frame.columns.get_loc(target_name)
type(predictors)
#type(list)(predictors)
pd.match(df.columns, predictors)  # returns the position of each match: the R equivalent of which(vect1 %in% vec2)


col = df.columns.isin(predictors)+df.columns.isin([target_name])  # this is probably not an elegant solution

target_name = 'TR_D90M12'
df_filtered = df.loc[ (df.FFINRPFH_czy == 1) 
            & (df.TR_ANEKS_RODZAJ_id  == 0) 
            & (df.TR_POZIOM_wykonanie == 1)  
            & (df.PORECZYCIEL_CZY == 0)
            & (pd.isnull(df.TR_FRAUD_DataStatusu)) 
            & (~ pd.isnull(df.loc[:,target_name])), col]
df_filtered.shape

df_filtered2 = df_filtered.dropna(axis = 0, how = 'any')
df_filtered2.shape
Example #27
def get_dense_specs():
    train = pd.read_csv('../input/train_set.csv', parse_dates=[
        2,
    ])
    test = pd.read_csv('../input/test_set.csv', parse_dates=[
        3,
    ])
    tube = pd.read_csv('../input/tube.csv',
                       true_values=['Y'],
                       false_values=['N'])
    materials = pd.read_csv('../input/bill_of_materials.csv')
    aggs = pd.read_csv('../input/ta_aggs.csv')
    components = pd.read_csv('../input/components.csv')

    train = pd.merge(train, tube, on='tube_assembly_id')
    test = pd.merge(test, tube, on='tube_assembly_id')
    train = pd.merge(train, materials, on='tube_assembly_id')
    test = pd.merge(test, materials, on='tube_assembly_id')

    train = pd.merge(train, aggs, on='tube_assembly_id', how='left')
    test = pd.merge(test, aggs, on='tube_assembly_id', how='left')

    # create some new features
    train['year'] = train.quote_date.dt.year
    train['month'] = train.quote_date.dt.month

    test['year'] = test.quote_date.dt.year
    test['month'] = test.quote_date.dt.month

    train['odd'] = train.quantity % 2
    test['odd'] = test.quantity % 2

    train['div5'] = (train.quantity % 5)
    test['div5'] = (test.quantity % 5)

    train['material_id'].replace(np.nan, ' ', regex=True, inplace=True)
    test['material_id'].replace(np.nan, ' ', regex=True, inplace=True)

    train['bracket_pricing'] = train['bracket_pricing'].replace(['Yes', 'No'],
                                                                [1, 0])
    test['bracket_pricing'] = test['bracket_pricing'].replace(['Yes', 'No'],
                                                              [1, 0])

    fields_to_encode = [
        'supplier', 'material_id', 'end_a', 'end_x', 'end_a_1x', 'end_a_2x',
        'end_x_1x', 'end_x_2x', 'bracket_pricing'
    ]

    for i in range(1, 9):
        column_label = 'component_id_' + str(i)
        fields_to_encode.append(column_label)
        tmp = pd.merge(train,
                       components,
                       left_on=column_label,
                       right_on='component_id',
                       how='left')['component_type_id']
        train[column_label] = tmp
        tmp = pd.merge(test,
                       components,
                       left_on=column_label,
                       right_on='component_id',
                       how='left')['component_type_id']
        test[column_label] = tmp
        train[column_label].replace(np.nan, ' ', regex=True, inplace=True)
        test[column_label].replace(np.nan, ' ', regex=True, inplace=True)

    for j, clf in enumerate(train.columns.tolist()):
        print(j, clf)
    '''    
    # label encode the categorical variables
    for i in fields_to_encode:
        print('Encoding',i)
        lbl = LabelEncoder()
        lbl.fit(list(train.ix[:,i]) + list(test.ix[:,i]))
        train.ix[:,i] = lbl.transform(train.ix[:,i])
        test.ix[:,i] = lbl.transform(test.ix[:,i])

    for i in fields_to_encode:
        print('Encoding',i)
        freqs = train[i].append(test[i]).value_counts()
        train[i] = pd.match(train[i].values, freqs[0:45].index)
        test[i] = pd.match(test[i].values, freqs[0:45].index)
    '''
    for i in fields_to_encode:
        print('Encoding', i)
        rank = pd.concat([train[i], train['cost']],
                         axis=1).groupby(i).mean().sort('cost',
                                                        ascending=False)
        print(rank[0:20])
        train[i] = pd.match(train[i].values, rank[0:45].index)
        test[i] = pd.match(test[i].values, rank[0:45].index)

    train.fillna(0, inplace=True)
    test.fillna(0, inplace=True)

    return train, test
Example #28
core_variables = [
    'DO_mgL', 'satDO_mgL', 'DOsat_pct', 'WaterTemp_C', 'Depth_m', 'Level_m',
    'Discharge_m3s', 'Light_PAR', 'Light_lux'
]

varcells = []
for x in sitedata.Variables:
    if x is None:
        varcells.append(x)
    else:
        var_arr = np.asarray(x.split(','))
        isCore = np.in1d(var_arr, core_variables)
        core = var_arr[isCore]
        not_core = var_arr[~isCore]
        if any(core):
            core = core[np.argsort(pd.match(core, core_variables))]
        not_core.sort()
        var_arr = ', '.join(np.concatenate((core, not_core)))
        varcells.append(var_arr)

for i in xrange(len(varcells)):
    if varcells[i] is None:
        varcells[i] = '-'

sitedata.Variables = varcells
fr = sitedata['firstRecord'].dt.strftime('%Y-%m-%d')
lr = sitedata['lastRecord'].dt.strftime('%Y-%m-%d')
timerange = fr + ' to ' + lr
sitedata['Coverage'] = timerange.apply(lambda x: x
                                       if x != 'NaT to NaT' else '-')
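Above, pd.match is used only to sort the core variables found at a site into the canonical core_variables order. A hedged sketch of the same ordering with Index.get_indexer, on toy lists:

import numpy as np
import pandas as pd

core_variables = ['DO_mgL', 'WaterTemp_C', 'Depth_m']
core = np.array(['Depth_m', 'DO_mgL'])  # variables present at one site

order = np.argsort(pd.Index(core_variables).get_indexer(core))
print(core[order])  # ['DO_mgL' 'Depth_m']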
Example #29
        index_end = int(np.minimum(
            (args.splitIndex) * np.ceil(float(h5f.shape[0] / 2) / args.splitFold), (h5f.shape[0] / 2)))
    else:
        index_start = 0
        index_end = int(h5f.shape[0] / 2)

    snp_temp = (np.asarray(h5f[index_start:index_end,:])+ np.asarray(h5f[index_start+int(h5f.shape[0]/2):index_end+int(h5f.shape[0]/2),:]))/2.0
    snpEffects.append(snp_temp)


coor = pd.read_csv(args.coorFile,sep='\t',header=None)
coor = coor.iloc[index_start:index_end,:]

#Fetch the distance to TSS information
gene = pd.read_csv(args.geneFile,sep='\t',header=None)
geneinds = pd.match(coor.iloc[:,0].map(str).str.replace('chr','')+' '+coor.iloc[:,1].map(str),
            gene.iloc[:,0].map(str).str.replace('chr','')+' '+gene.iloc[:,2].map(str))
if np.any(geneinds==-1):
    raise ValueError("Gene association file does not match the vcf file.")
if args.fixeddist == 0:
    dist = - np.asarray(gene.iloc[geneinds,-1])
else:
    dist = args.fixeddist
genename = np.asarray(gene.iloc[geneinds,-2])
strand= np.asarray(gene.iloc[geneinds,-3])

#comptue expression effects
snpExpEffects = compute_effects(snpEffects, \
                                dist, strand,\
                                models, maxshift=maxshift, nfeatures=args.nfeatures,
                                batchSize = args.batchSize)
#write output
Example #30
    train[c + '_m'] = train[c].dt.month
    train[c + '_d'] = train[c].dt.day
    train[c + '_wd'] = train[c].dt.weekday
    train[c + '_hr'] = train[c].dt.hour
    test[c + '_y'] = test[c].dt.year
    test[c + '_m'] = test[c].dt.month
    test[c + '_d'] = test[c].dt.day
    test[c + '_wd'] = test[c].dt.weekday
    test[c + '_hr'] = test[c].dt.hour
train.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

print("categorical variable encoding and cleaning...\n")
for c in train.columns[1:-1]:
    if train[c].dtype.name == 'object':
        freqs = train[c].append(test[c]).value_counts()
        train[c] = pd.match(train[c].values, freqs[0:70].index)
        test[c] = pd.match(test[c].values, freqs[0:70].index)

train = train.fillna(0)
test = test.fillna(0)

labels = train['target']
train.drop(['ID', 'target'], axis=1, inplace=True)
features = train.columns.values

print("filtering by pickled important columns...\n")
vars = pd.read_pickle("vars_importance.pkl")
train = train[vars.ix[0:1250, "var"].tolist()]
test = test[vars.ix[0:1250, "var"].tolist()]

print("converting to numpy array...\n")
Example #31
def setup(data, trips, sep, cost, factors, constraints, prodCon, attCon, initialParams, Oi, Dj, totalFlows):

    #For doubly constrained model
    if (prodCon == True) & (attCon == True):

        #Variables for constants and deriving them
        data["Bj"] = 1.0
        data["Ai"] = 1.0
        data["OldAi"] = 10.000000000
        data["OldBj"] = 10.000000000
        data["diff"] = abs((data["OldAi"] - data["Ai"])/data["OldAi"])

        #Calc total outflows and inflows
        if Oi:
            print '1'
            data["Oi"] = data[Oi]
        else:
            print '2'
            Oi = data.groupby(data[constraints['production']]).aggregate({trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips]

        if Dj:
            print '3'
            data["Dj"] = data[Dj]
        else:
            print '4'
            Dj = data.groupby(data[constraints['attraction']]).aggregate({trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips]


    #For Production Constrained model
    if (prodCon == True) & (attCon == False):

        #Calc total outflows
        if factors == None:
            print Dj
            if not Dj:
                Dj = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Dj"] = Dj.ix[pd.match(data[totalFlows], Dj.index)].reset_index()[trips].sort_index()

            else:
                data["Dj"] = data[Dj]

        if not Oi:
            Oi = data.groupby(data[constraints['production']]).aggregate({trips: np.sum})
            data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips]
        else:
            data['Oi'] = data[Oi]


    #For Attraction Constrained model
    if (prodCon == False) & (attCon == True):

        #Calc total inflows
        if factors == None:
            if not Oi:
                Oi = data.groupby(data[totalFlows]).aggregate({trips: np.sum})
                data["Oi"] = Oi.ix[pd.match(data[totalFlows], Oi.index)].reset_index()[trips]
            else:
                data["Oi"] = data[Oi]
        if not Dj:
            Dj = data.groupby(data[constraints['attraction']]).aggregate({trips: np.sum})
            data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips]
        else:
            data["Dj"] = data[Dj]


    #For Unconstrained Model
    if (prodCon == False) & (attCon == False):
        pass

    #The following setup is for within all models

    #There is always a beta parameter so set it to user's initial value and add to param list
    print initialParams
    data['beta'] = initialParams['beta']
    params = ['beta']

    #This is the observed data for which we want to derive parameters
    if cost == 'exp':
        knowns = data[sep]
    elif cost == 'pow':
        knowns = np.log(data[sep])
    else:
        sys.exit("The distance/cost function must be either 'pow' or 'exp'.")

    #If there are additional factors we will include that observed data, add it to param list, and add a data vector for the param
    if factors != None:
        if attCon != False:
            for factor in factors['origins']:
                #Include that information in the model
                knowns = knowns+np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                data[str(factor) + 'Param'] = initialParams[factor]
        if prodCon != False:
            for factor in factors['destinations']:
                #Include that information in the model
                knowns = knowns+np.log(data[factor])
                #Add to params list
                params.append(str(factor))
                #variable param vector
                print initialParams
                data[str(factor) + 'Param'] = initialParams[factor]

    #Observed information is sum of trips multiplied by the log of known information
    observed = np.sum(data[trips]*knowns)


    #return observed info, data, known info, and params list
    return observed, data, knowns, params
import pandas as pd

DATA_DIR = '/scratch/az338/ucc-fileserver/AZ_challenge_data/'

# challenge training data, cell/disease area(DA) mapping and challenge cmp/target mapping
challenge_pairs = pd.read_csv(DATA_DIR+'drug_synergy_data/ch1_train_combination_and_monoTherapy.csv')
cell_da_map = pd.read_csv(DATA_DIR+'sanger_molecular_data/cell_info.csv')
challenge_cmp_target_map = pd.read_csv(DATA_DIR+'drug_synergy_data/Drug_info_release_curated.csv')
challenge_cmp_target_map.columns = ['ChallengeName','Target'] + list(challenge_cmp_target_map.columns[2:])

# map disease area to corresponding cell-line
challenge_pairs['DISEASE_AREA'] = cell_da_map.iloc[pd.match(challenge_pairs['CELL_LINE'], cell_da_map['Sanger.Name'])]['Disease.Area'].tolist()
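
# Editor's sketch (not part of the original script): the same disease-area
# lookup without pd.match. Assumes 'Sanger.Name' values are unique; unmatched
# cell lines become NaN instead of -1 silently selecting the last row via .iloc.
da_lookup = cell_da_map.set_index('Sanger.Name')['Disease.Area']
challenge_pairs['DISEASE_AREA'] = challenge_pairs['CELL_LINE'].map(da_lookup)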

# make mapping flat (separate targets to different lines)
cmp_target_map = challenge_cmp_target_map[['ChallengeName','Target']]

flat_map = []
for i, r in cmp_target_map.iterrows():
    for t in r['Target'].split(','):
        flat_map.append([r['ChallengeName'],t.rstrip(' ').lstrip(' ')])

flat_map = pd.DataFrame(flat_map).drop_duplicates()
flat_map.columns = ['Compound','Target']

# convert compound-compound associations to target-target associations
synergy_scores = challenge_pairs[['DISEASE_AREA','COMPOUND_A','COMPOUND_B','SYNERGY_SCORE','QA','CELL_LINE']]
target_synergy = []
for i, r in synergy_scores.iterrows():
    #targets_A = flat_map.iloc[pd.match(r['COMPOUND_A'],flat_map['Compound'])]['Target']
    #targets_B = flat_map.iloc[pd.match(r['COMPOUND_B'],flat_map['Compound'])]['Target']
    targets_A = flat_map[flat_map['Compound'] == r['COMPOUND_A']]['Target']
Exemple #33
0



import os
import re

import pandas as pd
from numpy import arange, array, unique

# asjp2tokens is assumed to be defined/imported elsewhere in this project
path = os.getenv('HOME')+'/python/phylogeny/pavelMattis/vector_machines/'


for f in [x for x in os.listdir(path+'data/list_length_project/sets/CognateData/output') if x!='.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path+'data/list_length_project/sets/CognateData/output/'+f,encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
    output['ID'] = arange(len(data))+1
    output['Taxon'] = data.language.astype('string')
    output['Gloss'] = data.gloss.values
    output['GlossID'] = pd.match(data.gloss.values,data.gloss.unique())+1
    output['IPA'] = [re.sub(r"[ -]","",unicode(x)) for x in data.transcription]
    output['Tokens'] = [' '.join(asjp2tokens(unicode(w))) for w in output.IPA]
    cClasses = array([x+':'+unicode(y).strip('?')
                      for (x,y) in data[['gloss','cognate_class']].values])
    output['CogID'] = pd.match(cClasses,unique(cClasses))
    output[['Taxon','Gloss']] = output[['Taxon','Gloss']].astype('string')
    output['dbID'] = [db+'_'+str(x-1) for x in output.ID.values]
    output.to_csv('reformattedData/asjp/'+db+'.tsv',encoding='utf-8',
                  sep='\t',index=False)

for f in [x for x in os.listdir(path+'data/list_length_project/sets/mattis_new/output') if x!='.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path+'data/list_length_project/sets/mattis_new/output/'+f,encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
Exemple #34
0
def _create_pandas_frame(dataset_path, samples_path, targets_path):
    """
	Creates and returns a pandas DataFrame object that includes the dataset's
	samples and targets. Also, the samples are augmented by calculating and
	adding the feature7 column.
	
	Note that the function requires paths as arguments instead of the data
	itself (which is why the temp dir is created in the calling add_feature7).
	This is for reasons that were once reasonable.
	"""
    fname = dataset_path.split('/')[-1]
    db = fname.split('.')[0]
    # read in wordlist
    wordlist = pd.read_table(dataset_path,
                             encoding='utf-8',
                             na_filter=False,
                             dtype=object)
    # keep track of synonyms within the same language
    synDict = defaultdict(lambda: 0)
    synocc = []
    for l, g in wordlist[['language', 'global_id']].values:
        synDict[l, g] += 1
        synocc.append(unicode(synDict[l, g]))
    wordlist['synonym_number'] = synocc
    dDict = {
        'sample_id': unicode,
        'feature1': double,
        'feature2': double,
        'feature3': double,
        'feature4': double,
        'feature5': double,
        'feature6': double,
        'feature8': double
    }
    # read in feature matrix for word pairs
    vectors = pd.read_table(samples_path,
                            encoding='utf-8',
                            na_filter=False,
                            dtype=dDict)
    # read in cognacy judgments
    labels = pd.read_table(targets_path,
                           encoding='utf-8',
                           na_filter=False,
                           dtype={
                               'sample_id': unicode,
                               'target': int
                           })
    # collect metadata for word pairs in vectors
    metaRaw = array([x.split('/') for x in vectors.sample_id.values])
    meta = pd.DataFrame(c_[metaRaw[:,
                                   0], [x.split(',') for x in metaRaw[:, 1]],
                           [x.split(',') for x in metaRaw[:, 2]]],
                        columns=['global_id', 'l1', 'l2', 'id1', 'id2'])
    meta['sample_id'] = vectors.sample_id
    meta1 = pd.merge(wordlist[[
        'global_id', 'language', 'gloss', 'synonym_number', 'transcription',
        'cognate_class'
    ]],
                     meta,
                     left_on=['global_id', 'language', 'synonym_number'],
                     right_on=['global_id', 'l1', 'id1'])[[
                         'sample_id', 'global_id', 'l1', 'l2', 'transcription',
                         'cognate_class', 'id2'
                     ]]
    meta2 = pd.merge(wordlist[[
        'global_id', 'language', 'gloss', 'synonym_number', 'transcription',
        'cognate_class'
    ]],
                     meta1,
                     left_on=['global_id', 'language', 'synonym_number'],
                     right_on=['global_id', 'l2', 'id2'])[[
                         'sample_id', 'gloss', 'l1', 'transcription_y',
                         'cognate_class_y', 'l2', 'transcription_x',
                         'cognate_class_x'
                     ]]
    meta2.columns = [
        'sample_id', u'gloss', 'l1', u'w1', u'cc1', 'l2', u'w2', u'cc2'
    ]
    meta2 = meta2.ix[pd.match(vectors.sample_id, meta2.sample_id)]
    concepts = meta2.gloss.unique()
    feature7 = pd.Series([
        abs(
            corrcoef(
                array(
                    vectors[meta2.gloss == c][['feature2', 'feature4']].values,
                    double).T)[0, 1]) for c in concepts
    ],
                         index=concepts,
                         dtype=double)
    feature7[feature7.isnull()] = 0
    vectors['feature7'] = feature7.ix[meta2.gloss.values].values
    combined = pd.merge(pd.merge(meta2, vectors, on='sample_id'),
                        labels,
                        on='sample_id')
    combined = combined[combined.columns[1:]]
    combined['db'] = db

    return combined
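
# Usage sketch (editor's addition, not part of the original module): the three
# paths are hypothetical placeholders for the dataset, samples and targets
# files that the calling add_feature7 writes into its temp dir.
if __name__ == '__main__':
    combined_df = _create_pandas_frame('tmpdir/ielex.tsv',
                                       'tmpdir/ielex.samples.tsv',
                                       'tmpdir/ielex.targets.tsv')
    print combined_df.head()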
Exemple #35
0
            '.wt1100.fasta.ref.vcf'
        ],
                   shell=True)  #create .evo1 .evo2 .evo3
        try:
            check_call([
                'python evoevalues.production.py ' + sys.argv[1] +
                '.wt1100.fasta.ref.vcf'
            ],
                       shell=True)  #create .evo.evalues
            dataevoe = pd.read_csv(sys.argv[1] +
                                   '.wt1100.fasta.ref.vcf.evo.evalues',
                                   delimiter=',',
                                   header=None)
            dataevoe[0] = 'chr' + dataevoe[0].astype(str)
            matchedinds = pd.match(
                np.asarray(coordata['chr'].astype(str) +
                           coordata['pos'].astype(str)),
                np.asarray(dataevoe[0].astype(str) + dataevoe[1].astype(str)))
            dataevoe = np.asarray(dataevoe.iloc[:, -4:])
            dataevoe = dataevoe[matchedinds, :]
            #impute evolutionary feature E-values for the rare cases where evolutionary features are not available
            dataevoe[matchedinds == -1, :] = np.asarray([1, 1, 1,
                                                         1])[np.newaxis, :]
            datadeepsea = np.exp(
                np.mean(np.log(datae), axis=1) +
                np.mean(np.log(dataevoe), axis=1))
        except:
            datadeepsea = np.exp(np.mean(np.log(datae), axis=1))

        temp = pd.DataFrame(datadeepsea[:, np.newaxis])
        temp.columns = ['Functional significance score']
        datadeepsea = pd.concat([coordata, temp], axis=1)
Exemple #36
0
def testCluster(vdb,
                featureSubset=FEATURES,
                C=0.82,
                gamma=9e-04,
                kernel='linear',
                th=.34):
    """
	Inference on test data.
	"""
    newWordList = pd.DataFrame()
    fitting = trainingVectors
    validation = test[test.db == vdb].copy()
    X = fitting[featureSubset].values
    y = fitting.target.values
    svClf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True)
    svClf.fit(X, y)
    svScores = svClf.predict_proba(validation[featureSubset].values)[:, 1]
    validation['svScores'] = svScores
    scores = pd.DataFrame()
    wordlist = pd.DataFrame()
    concepts = validation.gloss.unique()
    taxa = unique(validation[['l1', 'l2']].values.flatten())
    dataWordlist = vstack([
        validation[['gloss', 'l1', 'w1', 'cc1']].values,
        validation[['gloss', 'l2', 'w2', 'cc2']].values
    ])
    dataWordlist = pd.DataFrame(
        dataWordlist, columns=['concept', 'doculect', 'counterpart', 'cc'])
    dataWordlist = dataWordlist.drop_duplicates()
    dataWordlist.index = [
        '_'.join(map(unicode, x))
        for x in dataWordlist[['concept', 'doculect', 'counterpart']].values
    ]
    validation['id_1'] = [
        c + '_' + l + '_' + unicode(w)
        for (c, l, w) in validation[['gloss', 'l1', 'w1']].values
    ]
    validation['id_2'] = [
        c + '_' + l + '_' + unicode(w)
        for (c, l, w) in validation[['gloss', 'l2', 'w2']].values
    ]
    for c in concepts:
        dataC = validation[validation.gloss == c].copy()
        dataC['id_1'] = [
            x.replace(' ', '').replace(',', '') for x in dataC.id_1
        ]
        dataC['id_2'] = [
            x.replace(' ', '').replace(',', '') for x in dataC.id_2
        ]
        wlC = dataWordlist[dataWordlist.concept == c].copy()
        if len(wlC) > 1:
            wlC.index = [
                x.replace(' ', '').replace(',', '') for x in wlC.index
            ]
            svMtx = zeros((len(wlC.index), len(wlC.index)))
            svMtx[pd.match(dataC.id_1, wlC.index),
                  pd.match(dataC.id_2, wlC.index)] = dataC.svScores.values
            svMtx[pd.match(dataC.id_2, wlC.index),
                  pd.match(dataC.id_1, wlC.index)] = dataC.svScores.values
            svDistMtx = log(1 - svMtx)
            tth = log(th) - svDistMtx.min()
            svDistMtx -= svDistMtx.min()
            fill_diagonal(svDistMtx, 0)
            pDict = infomap_clustering(tth, svDistMtx)
            pArray = vstack(
                [c_[pDict[k], [k] * len(pDict[k])] for k in pDict.keys()])
            partitionIM = pArray[argsort(pArray[:, 0]), 1]
        else:
            partitionIM = array([1])
        wlC['inferredCC'] = [vdb + ':' + c + ':' + str(x) for x in partitionIM]
        wlC['db'] = vdb
        newWordList = pd.concat([newWordList, wlC])
    newWordList.index = arange(len(newWordList))
    return newWordList
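
# Usage sketch (editor's addition, not part of the original script): run the
# classifier-based clustering over every database in the held-out test set and
# stack the inferred wordlists; 'test' is the module-level DataFrame assumed above.
allInferred = pd.concat([testCluster(vdb) for vdb in test.db.unique()])
allInferred.index = arange(len(allInferred))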
from numpy import *
import pandas as pd

# this script computes Cronbach's alpha for all languages in the sample

data = pd.read_csv('conceptwiseSimilarities.csv', index_col=0)

concepts = array(data.columns[-40:])

taxa = unique(data[['language1', 'language2']].values.flatten())

nrMts = []
for c in concepts:
    cMtx = zeros((len(taxa), len(taxa)))
    ix1 = list(pd.match(data.language1, taxa))
    ix2 = list(pd.match(data.language2, taxa))
    cMtx[ix1, ix2] = data[c].values
    cMtx[ix2, ix1] = data[c].values
    nrMts.append(cMtx)

matrices = zeros((len(taxa), len(taxa), len(nrMts)))

for c in xrange(40):
    matrices[:, :, c] = nrMts[c]


def cronbach(x):
    itemwise = sum(apply_along_axis(var, 0, x))
    total = var(apply_along_axis(sum, 1, x))
    return 1. * len(x) / (len(x) - 1) * (1 - itemwise / total)
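
# Reference (editor's note, not part of the original script): the standard form
# of Cronbach's alpha is
#   alpha = k / (k - 1) * (1 - sum_i var(item_i) / var(total score)),
# with k the number of items; cronbach() takes the item variances column-wise,
# the total score as the row sum over items, and uses k = len(x).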
Exemple #38
0
#https://stackoverflow.com/questions/36063251/python-pandas-how-can-i-group-by-and-assign-an-id-to-all-the-items-in-a-group
df["b"] = LabelEncoder().fit_transform(df['g'])     #int count from 0
#https://stackoverflow.com/questions/41594703/pandas-assign-an-index-to-each-group-identified-by-groupby
df['b'] = pd.Categorical(df['a'].astype(str)).codes
df['b'] = pd.Categorical(df['a'].astype(str) + df['c'].astype(str)).codes #allow multiple col groups



#R: ind = order(v)
y = np.argsort(v)
y = v.argsort()

#R: match(v1, vdict) -> vdict[match(v1,vdict)] gives v1
np.searchsorted(vdict,v1) #if vdict is sorted
vdict[np.searchsorted(vdict,v1)] #gives v1
pd.match([1,2,3,5,8,2],[1,2,4,5,9,2])
#R: match(c(1,2,3,5,8,2),c(1,2,4,5,9,2))

#R: d[order(v),]
d.reindex(np.argsort(d['c'])).reset_index(drop=True)
#R: setcolorder(d,new_col_order)
#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
#d.reindex_axis(['a','b','c'], axis=1) #deprecated
d.reindex(['a','b','c'], axis=1)  #copy all data

d.sort_values(["a","b"], ascending = [True,False], inplace=False)
d.sort_values("a", ascending = True) #inplace is False default

s.sort_values() #series no need to add input
d['b'] = d.a.sort_values() # wrong: the result realigns to d's index, undoing the sort
d['b'] = d.a.sort_values().values  #correct
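
#Editor's note (sketch, not from the original snippets): pd.match was removed
#in later pandas versions; Index.get_indexer gives the same position-or-minus-one
#lookup, provided the lookup values are unique.
v1 = [1, 2, 3, 5, 8, 2]
vdict = [1, 2, 4, 5, 9]
print pd.Index(vdict).get_indexer(v1)   #-> [ 0  1 -1  3 -1  1]
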
for i in range(1,sheet.nrows):
    if sheet.cell_value(i,0) != '':
        date_value = sheet.cell_value(i,0)
        ticker = sheet.cell_value(i,2)
        dt = datetime.datetime(*xlrd.xldate_as_tuple(date_value,book.datemode))
        row_index.append(i)
        dates.append(dt.strftime("%Y%m%d"))
        tickers.append(ticker)

df = pd.DataFrame({'Date':pd.Series(dates, index=row_index),'Ticker':pd.Series(tickers, index=row_index)})
df['Ticker'] = df['Ticker'].apply(lambda x: x.replace('-',' '))
                                  
#Find Unique Tickers and Get Exchange Info from Yahoo!
unique_tickers = pd.unique(df['Ticker']).tolist()
ref_position = pd.match(df['Ticker'].tolist(),unique_tickers).tolist()
unique_tickers = [i.replace(' ','-') for i in unique_tickers]

print('[Reading Exchange Information from Yahoo! Finance]')
exchange_info = []

for i in range(len(unique_tickers)/query_limit):
    print('Reading ' + str((i+1)*query_limit))
    query_url = 'http://download.finance.yahoo.com/d/quotes.csv?s=' + '+'.join(unique_tickers[i*query_limit:(i+1)*query_limit]) + '&f=x'
    if len(exchange_info) == 0:
        exchange_info = pd.read_csv(query_url,header=None).iloc[:,0].tolist()
    else:
        exchange_info.extend(pd.read_csv(query_url,header=None).iloc[:,0].tolist())

if len(unique_tickers)%query_limit > 0:
    print('Reading ' + str(len(unique_tickers)))
Exemple #40
0
        index_end = int(np.minimum(
            (args.splitIndex) * np.ceil(float(h5f.shape[0] / 2) / args.splitFold), (h5f.shape[0] / 2)))
    else:
        index_start = 0
        index_end = int(h5f.shape[0] / 2)

    snp_temp = (np.asarray(h5f[index_start:index_end,:])+ np.asarray(h5f[index_start+int(h5f.shape[0]/2):index_end+int(h5f.shape[0]/2),:]))/2.0
    snpEffects.append(snp_temp)


coor = pd.read_csv(args.coorFile,sep='\t',header=None)
coor = coor.iloc[index_start:index_end,:]

#Fetch the distance to TSS information
gene = pd.read_csv(args.geneFile,sep='\t',header=None)
geneinds = pd.match(coor.iloc[:,0].map(str).str.replace('chr','')+' '+coor.iloc[:,1].map(str),
            gene.iloc[:,0].map(str).str.replace('chr','')+' '+gene.iloc[:,2].map(str))
if np.any(geneinds==-1):
    raise ValueError("Gene association file does not match the vcf file.")
if args.fixeddist == 0:
    dist = - np.asarray(gene.iloc[geneinds,-1])
else:
    dist = args.fixeddist
genename = np.asarray(gene.iloc[geneinds,-2])
strand= np.asarray(gene.iloc[geneinds,-3])

#compute expression effects
snpExpEffects = compute_effects(snpEffects, \
                                dist, strand,\
                                models, maxshift=maxshift, nfeatures=args.nfeatures,
                                batchSize = args.batchSize)
#write output
Exemple #41
0
 def time_match_strings(self):
     pd.match(self.all, self.uniques)
Exemple #42
0
 def time_match_strings(self):
     pd.match(self.all, self.uniques)
  
 fl = pd.read_csv(reportPath + fl)
 fl = fl.replace(np.nan,0).replace('',0)
 report = fl.reset_index(drop = True)
  
 climb = climb.replace(np.nan, 0).replace('',0)
 ## apply designated filters:
 climb = climb[ climb.vols1825 <= 0.7]
 climb = climb[ climb.rollingSTD730 <= 0.2]
  
 #climb = climb[ climb['skew'] > 0]
 #climb = climb[ climb.avgRectractAsGainPercent_180 <= 80]
 climb = pd.merge(climb,coins, on = 'ticker')
  
 for periodCol in ['180']:
     climb['change'] = ~(climb['climbOrRetract_' + periodCol] == report['climbOrRetract_' + periodCol][pd.match(climb.ticker,report.ticker)].values)
     changeReport = climb[['ticker','latestPrice','climbOrRetract_' + periodCol,'avgRectractAsGainPercent_' + periodCol,'currentAsPercentOfPrevious_' + periodCol,'targetFib_'+ periodCol,
                              'target_'+ periodCol,'targetGain_' + periodCol,'avgDaysClimbing_' + periodCol,'avgDaysRetracting_' + periodCol,
                              'gainFromMin_1_2','gainFromMin_2_4','gainFromMin_5_7','change']]
     changeReport = changeReport[ changeReport.change == True].reset_index(drop = True)
      
     if changeReport.shape[0] > 0:
         changeReport.to_csv('C:\\Users\\Nick\\Documents\\project MONEY\\Output Reports\\crypto\\daily\\changeReport_daily' + periodCol + '_' + dte + '.csv', sep =',', index = False)
          
         splitLen = int(np.ceil(changeReport.ticker.unique().shape[0] / 3))
          
         msgImageData = []
            
         n = 0
         for i in range(splitLen):
             if n < changeReport.ticker.unique().shape[0]: