Example #1
    def test_to_csv_with_dst_transitions(self):

        with ensure_clean("csv_date_format_with_dst") as path:
            # make sure we are not failing on transitions
            times = pd.date_range(
                "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H", ambiguous="infer"
            )

            for i in [times, times + pd.Timedelta("10s")]:
                time_range = np.array(range(len(i)), dtype="int64")
                df = DataFrame({"A": time_range}, index=i)
                df.to_csv(path, index=True)

                # we have to reconvert the index as we
                # don't parse the tz's
                result = read_csv(path, index_col=0)
                result.index = pd.to_datetime(result.index).tz_localize("UTC").tz_convert("Europe/London")
                assert_frame_equal(result, df)

        # GH11619
        idx = pd.date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris")
        df = DataFrame({"values": 1, "idx": idx}, index=idx)
        with ensure_clean("csv_date_format_with_dst") as path:
            df.to_csv(path, index=True)
            result = read_csv(path, index_col=0)
            result.index = pd.to_datetime(result.index).tz_localize("UTC").tz_convert("Europe/Paris")
            result["idx"] = pd.to_datetime(result["idx"]).astype("datetime64[ns, Europe/Paris]")
            assert_frame_equal(result, df)

        # assert working
        df.astype(str)

        with ensure_clean("csv_date_format_with_dst") as path:
            df.to_pickle(path)
            result = pd.read_pickle(path)
            assert_frame_equal(result, df)
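
A minimal standalone sketch (outside the test harness, file names are illustrative) contrasting the two round-trips exercised above: pickle preserves the tz-aware index across the DST transition, while CSV stores plain strings and needs re-localization on read.

import pandas as pd

# tz-aware hourly index spanning the Europe/London DST transition
idx = pd.date_range(
    "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H", ambiguous="infer"
)
df = pd.DataFrame({"A": range(len(idx))}, index=idx)

# pickle keeps the timezone-aware index intact
df.to_pickle("dst_frame.pkl")
assert pd.read_pickle("dst_frame.pkl").index.equals(df.index)

# CSV drops the tz dtype, so the index must be re-localized after reading
df.to_csv("dst_frame.csv")
restored = pd.read_csv("dst_frame.csv", index_col=0)
restored.index = pd.to_datetime(restored.index, utc=True).tz_convert("Europe/London")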
def preeditimage(input_file, output_dir, params):
    """
    Segment the specified grayscale image and save the binary image to file.
    First, clean the image by removing the background and filtering it, then
    find the edges and threshold the result to produce a binary image. Finally,
    extract and verify the data from this image.

    Args:
        input_file (str): path to the raw input image file
        output_dir (str): output directory in which to save the segmented image
        params (dict): input parameters

    """

    # Do not overwrite existing output
    output_file = os.path.join(output_dir, os.path.basename(input_file))
    if os.path.isfile(output_file):
        img = imread(output_file)
    else:
        # Segment the grayscale image and save to file
        img = segment.main(imread(input_file), params["segment"])
        imsave(output_file, img)

    print " - segment: " + time.asctime()

    # Do not overwrite existing output
    output_file2 = os.path.splitext(output_file)[0] + ".pickle"
    if os.path.isfile(output_file2):
        return

    # Extract properties from the labeled image and save as a DataFrame
    data = extract.preedit(img, params["extract"])
    columns = (
        "Area",
        "BoundingBox",
        "Centroid",
        "EdgeSpline",
        "FourierFit",
        "Length",
        "MidSpline",
        "Perimeter",
        "StalkedPole",
        "SwarmerPole",
    )

    f = read.getframenum(input_file, params["segment"]["pattern"])
    if data:
        # Make MultiIndex with frame and label info
        j = [f] * len(data)
        k = [v["Label"] for v in data]
    else:
        # Create empty DataFrame
        data = [dict.fromkeys(columns, np.nan)]
        j = [f]
        k = [-1]
    index = MultiIndex.from_arrays((j, k), names=("Frame", "Label"))
    df = DataFrame(data, columns=columns, index=index)
    verify.preedit(df, params["verify"])
    df.to_pickle(output_file2)

    print " - extract: " + time.asctime()
df.to_pickle("savedFrames/predictionFeatures/paperTable")


print "Constructing Review Table"
i = 0
reviewTable = []

for id, review in loader.reviews.iteritems():
    paper = review.paper
    reviewer = review.user

    reviewTable.append(
        {
            "paperId": paper.id,
            "userId": reviewer.id,
            "rating": review.overallRating,
            "revPaperCount": len(reviewer.pastPapers),
            "revTopPaperCount": reviewer.topPastPapers,
            "revKDDPaperCount": reviewer.topKDDPast,
            ###Similarity
            "maxSimilarity": calcFeatures.getAuthorReviewerSimilarity(tfidf, paper.maxAuthor, reviewer),
            "primarySimilarity": calcFeatures.getAuthorReviewerSimilarity(tfidf, paper.primaryAuthor, reviewer),
            "maxJacSimilarity": calcFeatures.getAuthorReviewerSimilarity(
                tfidf, paper.maxAuthor, reviewer, jaccard=True
            ),
            "primaryJacSimilarity": calcFeatures.getAuthorReviewerSimilarity(
                tfidf, paper.primaryAuthor, reviewer, jaccard=True
            ),
            ###Co-Author Distance
            "minDist": review.minDist,
            "avgDist": review.avgDist,
            "revCountry": reviewer.country,
        }
    )

    i += 1
    if i % 250 == 0:
        print "(%d/%d) Reviews Completed" % (i, len(loader.reviews))

df = DataFrame(reviewTable)

toLog = ["revPaperCount", "revTopPaperCount", "revKDDPaperCount"]

for i in toLog:
    il = i + "Log"
    df[il] = df[i]
    df.loc[df[il] == 0, il] = 0.1
    df.loc[:, il] = np.log(df.loc[:, il])

df.to_pickle("savedFrames/predictionFeatures/reviewTable")
                game.predCoords = game.initPredCoords = [(0, 0), (10, 10), (0, 10)]
            elif nPreds == 4:
                game.predCoords = game.initPredCoords = [(0, 0), (10, 10), (0, 10), (10, 0)]
            results[nPreds], avgRMS, randomReturnValues[nPreds] = getResults(
                samples, episodes, discount, epsilon, alpha, initValue, softmax
            )
            winRatioDict[nPreds] = randomReturnValues[nPreds]["winratio"]
    else:
        sys.exit()

    results["episode"] = range(1, episodes + 1)
    winRatioDict["episode"] = range(1, episodes + 1)

    dataF_steps = DataFrame(results)
    dataF_steps.to_pickle("data/Sarsa_steps" + str(samples) + str(episodes) + category + str(softmax))

    dataF_winratio = DataFrame(winRatioDict)
    dataF_winratio.to_pickle("data/Sarsa_winratio" + str(samples) + str(episodes) + category + str(softmax))
else:
    dataF_steps = pd.read_pickle("data/Sarsa_steps" + str(samples) + str(episodes) + category + str(softmax))
    dataF_winratio = pd.read_pickle("data/Sarsa_winratio" + str(samples) + str(episodes) + category + str(softmax))

if graphtype == "steps":
    dataToPlot = dataF_steps
    ylabel = "Steps"
elif graphtype == "winratio":
    dataToPlot = dataF_winratio
    ylabel = "Win Ratio"

if smoothing:
    for par in parametersFor(category):
        dataToPlot[par] = scipy.ndimage.filters.gaussian_filter(dataToPlot[par], 5 * (episodes / 4000), 0)
Example #5
    notused, rmse["Q-learning"] = qlearningresults(
        samples, episodes, discount, epsilon, alpha, initValue, softmax, theta
    )
    notused, rmse["Q-learning with SoftMax"] = qlearningsoftmaxresults(
        samples, episodes, discount, tau, alpha, initValue, softmax, theta
    )
    notused, notused, rmse["On Policy Monte Carlo"] = montecarloOnPolicyresults(
        samples, episodes, discount, epsilon, 0, theta
    )
    notused, notused, notused, rmse["Off Policy Monte Carlo"] = montecarloOffPolicyresults(
        samples, episodes, discount, epsilon, 0, theta
    )

    rmse["episode"] = range(0, episodes)
    dataF = DataFrame(rmse)
    dataF.to_pickle("data/rmse" + str(episodes))
else:
    dataF = pd.read_pickle("data/rmse" + str(episodes))

episodeData = pd.melt(dataF, id_vars=["episode"], var_name="Learning algorithm")
# for key, value in rmse.items():
#     plt.figure()
#     plt.plot(value, 'b')
#     plt.xlabel('Episodes')
#     plt.ylabel('Root Mean Square Error ('+key+')')
#     plt.legend()

# plt.show()

p = (
    ggplot(episodeData, aes("episode", "value", color="Learning algorithm"))
                max_update_step=np.max(np.abs(f_phi_updates())),
                l_rate=l_rate,
            )
            # save the intensity plot:
            E_out = f_E_out()
            E2_out = f_E2_out()
            ax.imshow(E2_out[300:400, 300:400], vmin=0, vmax=1, **plot_args)
            ax.set_title("Intensity")
            ax2.imshow(E_out[0][300:400, 300:400], vmin=-1, vmax=1, **plot_args)
            ax2.set_title("Re(E)")
            fig_name = os.path.join(plotdir, "{n:06d}.png".format(n=n))
            plt.savefig(fig_name)

        if n % update_frequency == 0:
            # also renormalise the update rate:
            phi_rate_avg = np.mean(np.abs(f_phi_updates()))
            l_rate = np.min([update_rate_target / phi_rate_avg, 1.2 * l_rate])  # can go up by 20% at the most.
            updates = (
                (slmOpt.phi, slmOpt.phi - l_rate * slmOpt.phi_rate),
                (slmOpt.phi_rate, momentum * slmOpt.phi_rate + (1.0 - momentum) * grad),
            )
            update = theano.function([], cost, updates=updates, on_unused_input="warn")

    print "Finished gradient descent, saving summary."
    # create and save the dataframe with the learning curves:
    df = DataFrame(
        {"Cost_SE": l_cost_SE, "Cost_QE": l_cost_QE, "Mean_update": l_mean_update, "Max_update": l_max_update}
    )
    df.to_pickle(os.path.join(outputdir, "summary.pkl"))

    sys.exit()
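
A hedged sketch of reading the summary.pkl written above back in and plotting the learning curves; the figure layout and the output name learning_curves.png are illustrative.

import matplotlib.pyplot as plt
import pandas as pd

summary = pd.read_pickle("summary.pkl")              # columns written in the block above

fig, (ax_cost, ax_update) = plt.subplots(2, 1, sharex=True)
summary[["Cost_SE", "Cost_QE"]].plot(ax=ax_cost)
ax_cost.set_ylabel("Cost")
summary[["Mean_update", "Max_update"]].plot(ax=ax_update, logy=True)
ax_update.set_ylabel("Phase update")
ax_update.set_xlabel("Iteration")
plt.savefig("learning_curves.png")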
        # read descendant data
        IR = elt.find("Item_Response")
        el_data["Item_Response"] = IR.text.rstrip()

        for child in elt.getiterator("Item_DataPoint_Score_Details"):
            el_data["Final_Score"] = child.get("Final_Score")

        for child in elt.getiterator("Score"):
            read = child.get("Read_Number")
            el_data["Read" + read + "_Date"] = child.get("Date_Time")
            el_data["Read" + read + "_ID"] = child.get("Reader_ID")
            el_data["Read" + read + "_Score"] = child.get("Score_Value")
            el_data["Read" + read + "_Cond"] = child.get("Condition_Code")

        for child in elt.getiterator("Item_Alert"):
            el_data["Alert_Code"] = child.get("Alert_Code")
            el_data["Alert_ReaderID"] = child.get("Alert_ReaderID")

        # add element data to the file-level data
        data.append(el_data)

    # convert into a DataFrame, save as pickle and tsv, then move the XML to the processed directory.
    fileframe = DataFrame(data, columns=columns)
    fileframe.to_pickle(
        os.path.join(pickledir, os.path.splitext(xfile)[0] + ".pickle")
    )  # pickle files useful for further python processing
    fileframe.to_csv(
        os.path.join(tsvdir, os.path.splitext(xfile)[0] + ".tsv"), sep="\t"
    )  # tsv files to be used for R scripts
    shutil.move(os.path.join(frompems, xfile), xmldir)
Example #8
    if category == "epsilon":
        for epsilon in [0.05, 0.1, 0.3, 0.9]:
            results[epsilon], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
    elif category == "alpha":
        for alpha in [0.1, 0.2, 0.3, 0.6, 1]:
            results[alpha], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
    elif category == "discount":
        for discount in [0.1, 0.4, 0.7, 0.8, 0.9]:
            print discount
            results[discount], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
    else:
        sys.exit()
    print results
    results["episode"] = range(0, episodes)
    dataF = DataFrame(results)
    dataF.to_pickle("data/" + category + str(softmax))
else:
    dataF = pd.read_pickle("data/" + category + str(softmax))

episodeData = pd.melt(dataF, id_vars=["episode"], var_name=category)


p = (
    ggplot(episodeData, aes("episode", "value", color=category))
    + geom_line()
    + theme_bw()
    + theme()
    + ylab("Steps")
    + xlab("Episodes")
    + ylim(0, 60)
)
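
pd.melt above reshapes the wide results table (one column per parameter value plus "episode") into long form so ggplot can map the parameter to colour; a tiny sketch of that reshape with made-up epsilon columns.

import pandas as pd

wide = pd.DataFrame({
    "episode": [1, 2, 3],
    0.1: [55, 40, 30],        # steps per episode for epsilon=0.1 (toy numbers)
    0.3: [60, 45, 35],        # steps per episode for epsilon=0.3 (toy numbers)
})

long_form = pd.melt(wide, id_vars=["episode"], var_name="epsilon")
# long_form has columns: episode, epsilon, value -- one row per (episode, epsilon) pair
print(long_form)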
Example #9
class LearnObject:
    def __init__(self, FeatureObject, LabelsObject, LabelsObject2="notDefined"):
        self.FeaturesDF = FeatureObject.FeaturesDF
        self.LabelsObject = LabelsObject
        self.LabelsObject2 = LabelsObject2
        self.Details = {
            "LabelDetails": LabelsObject.LabelingDetails,
            "stratifiedKFold": FeatureObject.details,
            "FeatureMethod": FeatureObject.method,
            "PieceLength": FeatureObject.details["PieceLength"],
        }
        self.BestFeatures = {}
        self.N = LabelsObject.N
        self.model = "notDefined"

    class BestFeaturesForLabel:  # class of the best features for a certain labeling method (PatientsVsControls, mentalStatus, PANSS, etc.)
        def __init__(self, FeatureTypeList, LabelingList, n_features):
            self.df = DF(
                np.zeros([len(FeatureTypeList), n_features]),
                index=MultiIndex.from_tuples(FeatureTypeList),
                columns=range(n_features),
            )

        def add(self, bestNfeatures):  # adds a feature to best features list (length n_features)
            BestFeaturesList = [j for j in bestNfeatures]
            FeatureTypeList = self.df.index
            for feature in FeatureTypeList:
                if feature in BestFeaturesList:
                    isFeature = 1
                    FeatureLoc = BestFeaturesList.index(feature)
                    self.df.loc[feature][FeatureLoc] += 1

    """def analyzeFeaturesWeight(BestFeaturesDF,weights,ByLevel=0): #after having n features, this analyzes the wheighted mean of the use in each feature type. 
        df=BestFeaturesDF 
        #N=df.sum().sum()
        dfSum=df.sum(level=ByLevel)
        self.Mean=dfSum.sum(axis=1)
            
        weights=self.weights#[1.0/(x+1) for x in df.columns]            
        wSum=dfSum.mul(weights)
        wN=wSum.sum().sum()
        self.WeightedMean=wSum.sum(axis=1)/wN
        return WeightedMean"""

    # TODO -> add analysis according to facial part (according to excel..)
    # TODO - > add analysis according to learning weights (and not 0.1 : 0.9)

    def run(
        self,
        Model="ridge",
        kernel="linear",
        cross_validationMethod="KFold",
        FeatureSelection="PCA",
        n_features=20,
        scoringList=["specificity", "sensitivity", "precision", "f1", "accuracy", "ss_mean"],
        isSaveCsv=None,
        isSavePickle=None,
        isSaveFig=None,
        isPerm=0,
        isBetweenSubjects=True,
        isConcatTwoLabels=False,
    ):
        # -- TODO :
        # --  # Greedy selection on features + Other feature selection types...
        # --  # Make sure features are selected as best based only on train data!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # add f feature analysis by facial part (see excel)
        # --  # select best model (svm, otherwise ridge regression)
        # --  # compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
        # --  # check how the model weights behave - feature selection analysis
        # --  # calc model error
        # --  # divide data to subparts for training and testing - try within/ between subject, and analyze distribution of features when data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # add mental status rank scores (0-4)
        # --  # make sure p-val returns the right value in 'scores'
        # --  # run it over random data (permutation test)
        # --  # continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

        ## init
        FeatureTypeList = [j for j in tuple(self.FeaturesDF.index)]
        self.FullResults = DF()
        self.Learningdetails = {
            "Model": Model,
            "Kernel": kernel,
            "CrossVal": cross_validationMethod,
            "FeatureSelection": FeatureSelection,
            "LabelBy": self.Details["LabelDetails"].keys()[0],
            "FeatureMethod": self.Details["FeatureMethod"],
            "PieceLength": self.Details["PieceLength"],
        }
        print("\n------------Learning Details------------")
        print(DF.from_dict(self.Learningdetails, orient="index"))
        print("\n----" + cross_validationMethod + " Cross validation Results:----")

        # Set learning params (cross validation method, and model for learning)
        isBoolLabel = self.LabelsObject.isBoolLabel
        isBoolScores = isBoolLabel
        model, isBoolModel, featureSelectionMethod, selectFeaturesFunction = learningUtils.setModel(
            Model, FeatureSelection, n_features
        )
        # define global variables over modules (to be used in myUtils)
        globalVars.transformMargins = 0  # lambda x:x
        globalVars.isBoolLabel = isBoolLabel
        globalVars.isBoolModel = isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects
        trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects = labelUtils.initTrainTestLabels_all(
            self.LabelsObject
        )
        trainLabels_all2, testLabels_all2, TrueLabels2, isAddDroppedSubjects2 = labelUtils.initTrainTestLabels_all(
            self.LabelsObject2
        )

        LabelingList = ["N1"]  # trainLabels_all.columns
        self.ResultsDF = DF()
        self.BestFeatures = DF(columns=LabelingList)  # dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels = pandas.Panel(
            items=range(len(trainLabels_all)), major_axis=LabelingList, minor_axis=TrueLabels.index
        )  # panel: items=cv_ind, major=labels, minor=#TODO

        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList.
        for label_ind, Labeling in enumerate(LabelingList):
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            # set subjects list according to labels and features
            X, SubjectsList, droppedSubjects, Xdropped = featuresUtils.initX(self.FeaturesDF, trainLabels_all, Labeling)
            X2, SubjectsList2, droppedSubjects2, Xdropped2 = featuresUtils.initX(
                self.FeaturesDF, trainLabels_all2, Labeling, is2=1
            )

            # init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(
                Labeling, SubjectsList, trainLabels_all, testLabels_all
            )
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(
                Labeling, SubjectsList2, trainLabels_all2, testLabels_all2
            )

            # make sure only labeled subjects are used for classification
            X = X.query("subject == " + str(list(trainLabels.index)))
            X.index.get_level_values(X.index.names[0])
            SubjectIndex = list(set(X.index.get_level_values("subject")))

            X2 = X2.query("subject == " + str(list(trainLabels2.index)))
            X2.index.get_level_values(X2.index.names[0])
            SubjectIndex2 = list(set(X2.index.get_level_values("subject")))
            # init vars
            if isBetweenSubjects:
                cv_param = len(SubjectIndex)
                self.Learningdetails["CrossValSubjects"] = "between"
                isWithinSubjects = False
            else:
                isWithinSubjects = True
                X = X.swaplevel(0, 1)
                PieceIndex = list(set(X.index.get_level_values("Piece_ind")))
                cv_param = len(PieceIndex)
                self.Learningdetails["CrossValSubjects"] = "within"

            self.Learningdetails["NumOfFeatures"] = n_features

            print("\n**" + Labeling + "**")

            cv, crossValScores = learningUtils.setCrossValidation(
                cross_validationMethod, cv_param, trainLabels, isWithinSubjects
            )

            ## Learning - feature selection for different scoring types, with cross validation -

            BestFeaturesForLabel = self.BestFeaturesForLabel(
                FeatureTypeList, LabelingList, n_features
            )  # saves dataframe with best features for each label, for later analysis
            cv_ind = 0
            # used for transforming from margins returned from svm to continuous labels (e.g. PANSS)
            trainScores = DF()
            test_index = X.index
            testScores = concat([DF(index=test_index), DF(index=["std_train_err"])])
            testScores2 = concat([DF(index=testLabels.index), DF(index=["std_train_err"])])
            # impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange = LabelRange

            ModelWeights1 = DF(columns=range(len(cv)), index=X.columns)
            Components = pandas.Panel(
                items=range(len(cv)), major_axis=X.columns, minor_axis=range(n_features)
            )  # todo fix this for 1st and second learning
            ExplainedVar = DF(columns=range(len(cv)))
            ModelWeights2 = DF(columns=range(len(cv)))
            for train, test in cv:

                if isBetweenSubjects:
                    # set X and Y
                    train_subjects = trainLabels.iloc[train].index
                    test_subjects = testLabels.iloc[test].index
                    Xtrain, Xtest, Ytrain, YtrainTrue, Ytest = learningUtils.setXYTrainXYTest(
                        X, Labeling, trainLabels, testLabels, TrueLabels, train_subjects, test_subjects
                    )
                    Xtrain2, Xtest2, Ytrain2, YtrainTrue2, Ytest2 = learningUtils.setXYTrainXYTest(
                        X2, Labeling, trainLabels2, testLabels2, TrueLabels2, train_subjects, test_subjects
                    )

                    if isConcatTwoLabels:  # used when there is more than one doctor
                        Xtrain = concat([Xtrain, Xtrain2])
                        Xtest = concat([Xtest, Xtest2])
                        Ytrain = concat([Ytrain, Ytrain2])
                        YtrainTrue = concat([YtrainTrue, YtrainTrue2])
                        Ytest = concat([Ytest, Ytest2])
                        Xdropped = concat([Xdropped, Xdropped2])
                        SubjectsList = list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects = list(
                            set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList))
                        )  # diff from SubjectsList to make sure no subjects are both in train and test.
                    """else:
                        Xtrain=Xtrain1
                        Xtest=Xtest1
                        Xdropped=Xdropped1
                        Ytrain=Ytrain1
                        YtrainTrue=YtrainTrue1
                        Ytest=Ytest1"""

                    # select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar, decomposeFunc = learningUtils.selectBestNfeatures(
                        Xtrain, Xtest, Ytrain, n_features, selectFeaturesFunction
                    )
                    BestFeaturesForLabel.add(bestNfeatures)  # todo - delete this??

                    # train 1
                    TrainModel = model
                    TrainModel.fit(Xtrain.sort_index(), Ytrain.T.sort_index())
                    try:
                        Components[cv_ind] = components.T
                        ExplainedVar[cv_ind] = explainedVar
                        isDecompose = True
                        if cv_ind == 0:
                            ModelWeights1 = DF(columns=range(len(cv)), index=range(len(bestNfeatures)))
                        ModelWeights1[cv_ind] = TrainModel.coef_.flatten()
                    except AttributeError:
                        isDecompose = False
                        ModelWeights1[cv_ind].loc[bestNfeatures] = TrainModel.coef_.flatten()
                    self.isDecompose = isDecompose
                    # train 2
                    if isBoolLabel:
                        PiecePrediction_train = DF(
                            TrainModel.predict(Xtrain), index=Xtrain.index, columns=["prediction"]
                        )
                        TrainModel2 = svm.SVC(kernel="linear", probability=True, class_weight={0: 1, 1: 1})
                    else:
                        PiecePrediction_train = DF(
                            TrainModel.decision_function(Xtrain), index=Xtrain.index, columns=["prediction"]
                        )
                        TrainModel2 = linear_model.LinearRegression()

                    Xtrain2, Ytrain2, YtrainTrue2 = learningUtils.getX2Y2(
                        Xtrain, Ytrain, YtrainTrue, PiecePrediction_train, isBoolLabel
                    )
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind == 0:
                        ModelWeights2 = DF(columns=range(len(cv)), index=Xtrain2.columns)
                    ModelWeights2[cv_ind] = TrainModel2.coef_.flatten()

                    # test 1
                    if (
                        isAddDroppedSubjects
                    ):  # take test subjects from cv + subjects that were dropped for labeling used for test
                        if isDecompose:
                            dXdropped = DF(decomposeFunc(Xdropped).values, index=Xdropped.index)
                        XtestDropped = dXdropped[bestNfeatures]
                        YtestDropped = Series(XtestDropped.copy().icol(0))
                        # YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject] = testLabels_all[Labeling].loc[subject]
                            # YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest = concat([Ytest, YtestDropped]).sort_index()
                        Xtest = concat([Xtest, XtestDropped]).sort_index()

                    if isPerm:  # TODO- Check this!!
                        Ytest = y_perms.loc[Ytest.index]
                    Xtest = Xtest.fillna(0.0)

                elif isWithinSubjects:
                    # train 1
                    train_pieces = PieceIndex[train]
                    test_pieces = PieceIndex[
                        test
                    ]  # TODO - make sure that if test/train > piece index, it ignores it and repeats the process

                    XtrainAllFeatures = X.query("Piece_ind == " + str(list(train_pieces)))
                    Ytrain = Series(index=X.index)
                    Ytest = Series(index=X.index)
                    YtrainTrue = Series(index=X.index)

                    for subject in PieceIndex:
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject] = trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject] = TrueLabels[Labeling].loc[subject]
                            Ytest.loc[piece].loc[subject] = testLabels[subject]
                    Ytrain = Ytrain.dropna()
                    YtrainTrue = YtrainTrue.dropna()
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject] = testLabels[subject]
                # train scores 1
                if cv_ind == 0:
                    trainScores, YtrainPredicted = learningUtils.getTrainScores(Ytrain, Xtrain, YtrainTrue, TrainModel)
                    plt.figure(1)
                    if len(LabelingList) > 1:
                        plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
                    if isBoolLabel:
                        testScores = learningUtils.getTestScores(Ytest, Xtest, TrainModel)
                    else:
                        testScores[cv_ind] = learningUtils.getTestScores(Ytest, Xtest, TrainModel)
                        plt.title(Labeling, fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores, YtrainPredicted = learningUtils.getTrainScores(
                        Ytrain, Xtrain, YtrainTrue, TrainModel
                    )
                    trainScores = concat([trainScores, new_trainScores], axis=1)
                    # test 1
                    testScores[cv_ind] = learningUtils.getTestScores(Ytest, Xtest, TrainModel)

                # train2

                if isBoolLabel:
                    PiecePrediction_test = DF(TrainModel.predict(Xtest), index=Xtest.index, columns=["prediction"])
                else:
                    PiecePrediction_test = DF(
                        TrainModel.decision_function(Xtest), index=Xtest.index, columns=["prediction"]
                    )
                Xtest2, Ytest2, YtestTrue2 = learningUtils.getX2Y2(
                    Xtest, Ytest, Ytest, PiecePrediction_test, isBoolLabel
                )

                if cv_ind == 0:
                    trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                        Ytrain2, Xtrain2, YtrainTrue2, TrainModel2
                    )
                    YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                    # plt.figure(1)
                    # if len(LabelingList)>1:
                    # plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    # test2
                    if isBoolLabel:
                        testScores2 = learningUtils.getTestScores(Ytest2, Xtest2, TrainModel2)
                    else:
                        testScores2[cv_ind] = learningUtils.getTestScores(Ytest2, Xtest2, TrainModel2)
                    # plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                        Ytrain2, Xtrain2, YtrainTrue2, TrainModel2
                    )
                    YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                    trainScores2 = concat([trainScores2, new_trainScores2], axis=1)
                    testScores2[cv_ind] = learningUtils.getTestScores(Ytest2, Xtest2, TrainModel2)
                cv_ind += 1

                # crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data.
            fig2 = plt.figure(2)
            if len(LabelingList) > 1:
                plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
            # if isAddDroppedSubjects:
            # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
            # else:
            # testLabelsSummary=testLabels
            scoresSummary = learningUtils.getScoresSummary(trainScores2, testScores2, TrueLabels[Labeling])
            # reset global vars
            globalVars.fitYscale = "notDefined"
            globalVars.beta = DF()

            plt.title(Labeling, fontsize=10)
            plt.xlabel("Ytrue", fontsize=8)
            plt.ylabel("Ypredicted", fontsize=8)
            plt.tick_params(labelsize=6)
            # print(crossValScores.T)
            scores = scoresSummary.fillna(0.0)

            # analyze feature weights

            WeightedFeatures1 = DF(
                [ModelWeights1.mean(axis=1), ModelWeights1.std(axis=1)], index=["mean", "std"]
            ).T.fillna(0)
            if isDecompose == 0:
                WeightedFeatures1FeatureType = WeightedFeatures1.mean(level="FeatureType")
                WeightedFeatures1FsSingal = WeightedFeatures1.mean(level="fs-signal")
                WeightedFeatures1 = concat(
                    [
                        DF(index=["-------(A) FeatureType-------"]),
                        WeightedFeatures1FeatureType,
                        DF(index=["-------(B) faceshift signal-------"]),
                        WeightedFeatures1FsSingal,
                    ]
                )

            WeightedFeatures2 = DF(
                [ModelWeights2.mean(axis=1), ModelWeights2.std(axis=1)], index=["mean", "std"]
            ).T.fillna(0)
            BestFeatures = concat(
                [
                    DF(index=["------------- Learning 1 -------------"]),
                    WeightedFeatures1,
                    DF(index=["------------- Learning 2 -------------"]),
                    WeightedFeatures2,
                ]
            )
            self.BestFeatures[Labeling] = BestFeatures["mean"]

            # analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T  # todo- check!
                ExplainedVar_mean.index = ["ExplainedVar_mean"]
                ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T  # todo- check!
                ExplainedVar_std.index = ["ExplainedVar_std"]
                try:
                    self.LabelComponents[Labeling] = concat(
                        [
                            DF(index=["---components mean---"]),
                            Components_mean,
                            ExplainedVar_mean,
                            DF(index=["---components std over cross validation---"]),
                            Components_std,
                            ExplainedVar_std,
                        ]
                    )
                except AttributeError:
                    self.LabelComponents = dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling] = concat(
                        [
                            DF(index=["---components mean---"]),
                            Components_mean,
                            ExplainedVar_mean,
                            DF(index=["---components std over cross validation---"]),
                            Components_std,
                            ExplainedVar_std,
                        ]
                    )

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

            # BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults = concat([DF(index=[Labeling]), scores])

            self.FullResults = concat([self.FullResults, LabelFullResults])
            self.ResultsDF = concat([self.ResultsDF, DF(scores[0], columns=[Labeling])], axis=1)
        # continue here!! to build pseudo inverse matrix from predicted to true - make sure columns + rows are set!

        # self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

        # plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3 = pandas.Panel(items=range(len(X2.index)))  # for each cv score...
        FullSubjectsList = YpredictedOverAllLabels[0].columns
        YdroppNans = YpredictedOverAllLabels.dropna(axis=0, how="all")
        YdroppNans = YdroppNans.dropna(axis=1, how="all")
        YpredictedOverAllLabels = YdroppNans.dropna(axis=2, how="all")
        notNans_cv_ind = YpredictedOverAllLabels.items
        notNans_trainSubjects = YpredictedOverAllLabels.minor_axis
        notNans_LabelsList = YpredictedOverAllLabels.major_axis
        notNans_TrueLabels = TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind = 0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train = list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test = list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train) > 0 and len(test) > 0:
                    AllLabelsYTrainPredicted = YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted = AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue = notNans_TrueLabels[train]
                    AllLabelsYTestPredicted = YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue = notNans_TrueLabels[test]

                    pseudoInverse_AllLabelsYTrainTrue = DF(
                        np.linalg.pinv(AllLabelsYTrainTrue),
                        columns=AllLabelsYTrainTrue.index,
                        index=AllLabelsYTrainTrue.columns,
                    )
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix = DF(
                        AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),
                        columns=pseudoInverse_AllLabelsYTrainTrue.columns,
                    )  # change to real code!!
                TrainModel3 = lambda y: y.T.dot(AllLabelsTransformationMatrix)
                testScores3[cv_ind] = learningUtils.getTestScores(
                    AllLabelsYTrainTrue, AllLabelsYTrainPredicted, TrainModel3
                )
            cv_ind += 1

        self.ResultsDF = self.ResultsDF.fillna(0.0)

        ## Print and save results
        print("\n")
        print(self.ResultsDF)
        print("\n")
        D = self.Learningdetails
        savePath = (
            resultsPath
            + "\\"
            + D["Model"]
            + "_"
            + D["CrossVal"]
            + "_LabelBy"
            + D["LabelBy"]
            + "_Features"
            + D["FeatureMethod"]
            + "_FS"
            + FeatureSelection
            + "_Kernel"
            + D["Kernel"]
            + "_"
            + D["CrossValSubjects"]
            + "Subjects_PieceSize"
            + D["PieceLength"]
        )
        if isPerm:
            savePath = savePath + "_PERMStest"
        saveName = savePath + "\\" + str(n_features) + "_features"
        self.Learningdetails["saveDir"] = savePath
        dir = os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle = int(raw_input("Save Results to pickle? "))
        if isSaveCsv is None:
            isSaveCsv = int(raw_input("save Results to csv? "))
        if isSaveFig is None:
            isSaveFig = int(raw_input("save Results to figure? "))

        if isSavePickle:
            self.ResultsDF.to_pickle(saveName + ".pickle")
            self.BestFeatures.to_pickle(saveName + "_bestFeatures.pickle")

        if isSaveCsv:
            DetailsDF = DF.from_dict(self.Learningdetails, orient="index")
            ResultsCSV = concat(
                [
                    self.ResultsDF,
                    DF(index=["-------Label Details-------"]),
                    self.N,
                    DF(index=["-------Learning Details-------"]),
                    DetailsDF,
                    DF(index=["-------Selected Features Analysis------"]),
                    self.BestFeatures,
                ]
            )
            ResultsCSV.to_csv(saveName + ".csv")

        if isSaveCsv or isSavePickle:
            print("successfully saved as:\n" + saveName)

        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + "Train.png")
            plt.figure(2)
            plt.savefig(saveName + "Test.png")
        plt.close()
        plt.close()
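
The "continue here!!" block near the end of run() builds, per cross-validation fold, a transformation matrix from the predicted label scores and the pseudo-inverse of the true label matrix. A minimal sketch of that linear map with random data; the label and subject names are made up.

import numpy as np
from pandas import DataFrame

labels = ["N1", "N2"]
subjects = ["s1", "s2", "s3", "s4"]

# hypothetical predicted and true label scores (rows = labels, columns = subjects)
Y_pred = DataFrame(np.random.randn(2, 4), index=labels, columns=subjects)
Y_true = DataFrame(np.random.randn(2, 4), index=labels, columns=subjects)

# T maps true label vectors to predicted ones: Y_pred ~= T . Y_true
pinv_true = DataFrame(np.linalg.pinv(Y_true), index=Y_true.columns, columns=Y_true.index)
T = Y_pred.dot(pinv_true)          # shape: labels x labels

reconstructed = T.dot(Y_true)      # compare against Y_pred to judge the fit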
from pathlib import Path
from itertools import chain, repeat
from pandas import DataFrame
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

corpus = DataFrame(columns=["is_negative", "tokens"])

for is_negative, review_path in chain(
    zip(repeat(0), Path("aclImdb/train/pos").iterdir()),
    zip(repeat(1), Path("aclImdb/train/neg").iterdir()),
    zip(repeat(0), Path("aclImdb/test/pos").iterdir()),
    zip(repeat(1), Path("aclImdb/test/neg").iterdir()),
):
    with review_path.open(encoding="UTF-8") as review_file:
        file = str(review_path.relative_to("aclImdb"))
        tokens = word_tokenize(BeautifulSoup(review_file.read()).text)
        corpus.loc[file] = is_negative, tokens
        print(len(corpus))

corpus.to_pickle("corpus.pkl")
                    print (discount)
                    results[discount], avgRMS, randomReturnValues[discount] = getResults(
                        samples, episodes, discount, epsilon, alpha, initValue, softmax
                    )
            elif category == "winratio":
                _, _, randomReturnValues["winratio"] = getResults(
                    samples, episodes, discount, epsilon, alpha, initValue, softmax
                )
                results = randomReturnValues["winratio"]
            else:
                sys.exit()
            print (results)
            results["episode"] = range(1, episodes + 1)
            print predCoords
            dataF = DataFrame(results)
            dataF.to_pickle("data/" + str(predCoords) + str(episodes) + category + str(softmax))
            pickle.dump(
                randomReturnValues,
                open("data/values" + str(predCoords) + str(episodes) + category + str(softmax), "w+"),
            )
        else:
            dataF = pd.read_pickle("data/" + str(predCoords) + str(episodes) + category + str(softmax))
            randomReturnValues = pickle.load(
                open("data/values" + str(predCoords) + str(episodes) + category + str(softmax), "r+")
            )

        ylabel = "Steps"

        ylimG = [30, 180 / len(predCoords)]
        if graphtype == "steps":
            dataF = dataF
Example #12
#!/usr/bin/python2.7

from avro.datafile import DataFileReader
from avro.io import DatumReader
from pandas import DataFrame

avro_file = "/Users/srharnett/Downloads/ufo_data/ufo_awesome.avro"
reader = DataFileReader(open(avro_file, "r"), DatumReader())
df = DataFrame(list(reader))
df.to_pickle("ufo_data.pkl")
Example #13
def authorization(request):
    client = Client()
    code = request.GET["code"]
    access_token = client.exchange_code_for_token(
        client_id=MY_STRAVA_CLIENT_ID, client_secret=MY_STRAVA_CLIENT_SECRET, code=code
    )

    # making a global variable to be used across views. don't know how this will work in practice

    client = Client(access_token=access_token)
    athlete = client.get_athlete()  # Get current athlete details

    global athleteId
    athleteId = athlete.id

    # if athlete doesn't exist, add them
    if len(Athlete.objects.filter(athleteId=athleteId)) == 0:
        ath = Athlete.objects.create(
            name=str(athlete.firstname + " " + athlete.lastname),
            athleteId=athleteId,
            profilePic=athlete.profile,
            city=athlete.city,
            country=athlete.country,
            sex=athlete.sex,
            premium=athlete.premium,
            created_at=athlete.created_at,
            updated_at=athlete.updated_at,
            followers=athlete.follower_count,
            friends=athlete.friend_count,
            email=athlete.email,
            weight=athlete.weight,
            meas_pref=athlete.measurement_preference,
            runsSummary=DataFrame({}).to_json(orient="records"),
            fitLines=DataFrame({}).to_json(orient="records"),
            masterList=DataFrame({}).to_json(orient="records"),
        )

        ath.profilePic.name = "rudyzPic"
        ath.save(update_fields=["profilePic"])

    # if athlete already exists, draw their file
    elif len(Athlete.objects.filter(athleteId=athleteId)) == 1:
        ath = Athlete.objects.get(athleteId=athleteId)

    ############################################
    ##### compiling new runs, updating summary

    # athlete's existing runs summary
    existingSummary = DataFrame(pd.read_json(ath.runsSummary))
    existingFitlines = DataFrame(pd.read_json(ath.fitLines))
    masterList = DataFrame(pd.read_json(ath.masterList))

    activities = list(client.get_activities())

    # activity IDs of runs already in the system
    try:
        ids = existingSummary.activityId
    except AttributeError:
        ids = []

    for i in range(len(activities)):
        # for i in range(30,37):
        # Ignoring activities already in the system
        if (len(ids) == 0) or (float(activities[i].id) not in list(ids)):

            try:
                # compiling df for raw json-ization
                activityId = activities[i].id
                run = client.get_activity_streams(
                    activityId, types=["time", "latlng", "distance", "heartrate", "altitude", "cadence"]
                )
                latlng = run["latlng"].data
                time = run["time"].data
                distance = run["distance"].data
                heartrate = run["heartrate"].data
                altitude = run["altitude"].data
                cadence = run["cadence"].data
                date = activities[i].start_date_local
                activity = activityId
                dfi = thresher.assemble(date, activityId, heartrate, distance, time, altitude, latlng, cadence)

                # basic cleanup, only removing totally unreasonable values
                dfi = thresher.basicClean(dfi)

                # if we ever want to try our hand at improving strava's speed data (ie by predicting speed when GPS blanks), intervene here:

                # dfi = thresher.addDistDeltas(dfi)

                try:
                    fitline = thresher.getFitlineLws(dfi)  # this adds speed-shifted columns
                except:
                    fitline = pd.DataFrame({})

                try:
                    mafScore = fitline[fitline.hr == 140.0].avgSpeed.iloc[0]
                    print "MAF "
                    print mafScore
                except:
                    mafScore = np.nan

                fitline_json = fitline.to_json(orient="records")

                # getting summary info for run (as one-entry dict)
                runSummary = thresher.getSingleSummaryDf(dfi)

                # adding mafScore to summary
                runSummary["mafScore"] = mafScore

                print runSummary

                # adding predicted hr and speed values
                # dfi = thresher.getPred(dfi)

                # saving entry to database
                Activity.objects.create(
                    act_id=activityId,
                    name=str(activities[i].name),
                    description=activities[i].description,
                    act_type=activities[i].type,
                    date=activities[i].start_date_local,
                    timezone=activities[i].timezone,
                    df=dfi.to_json(orient="records"),
                    avgHr=runSummary["avgHr"],
                    hrVar=runSummary["variation"],
                    realMiles=runSummary["realMiles"],
                    recovery=runSummary["recovery"],
                    easy=runSummary["easy"],
                    stamina=runSummary["stamina"],
                    impulse=runSummary["impulse"],
                    totalTime=runSummary["totalTime"],
                    totalDist=runSummary["totalDist"],
                    climb=runSummary["climb"],
                    fitline=fitline_json,
                    mafScore=mafScore,
                    athlete=ath,
                )

                # updating runs summary
                existingSummary = existingSummary.append(runSummary, ignore_index=True)
                existingFitlines = existingFitlines.append(fitline, ignore_index=True)
                masterList = masterList.append(dfi, ignore_index=True)

            except:
                continue

    # saving updated runs summary to athlete profile
    ath.runsSummary = existingSummary.to_json(orient="records")
    ath.save(update_fields=["runsSummary"])

    existingSummary.to_pickle("runsSummary.txt")

    # saving updated runs summary to athlete profile
    ath.fitLines = existingFitlines.to_json(orient="records")
    ath.save(update_fields=["fitLines"])

    ath.masterList = masterList.to_json(orient="records")
    ath.save(update_fields=["masterList"])

    # testing...
    existingSummary = pd.read_json(ath.runsSummary)
    # print(existingSummary)

    existingFitlines = pd.read_json(ath.fitLines)
    # print(existingFitlines)

    global path
    path = os.path.dirname(__file__)
    # updating dataframe, pickling for use in other views
    # global df
    # df = thresher.masterAssemble(client)

    masterDf = pd.read_json(ath.masterList)
    # print(masterDf)
    masterDf.to_pickle(str(path) + "/" + str(athlete.id) + "masterDf.txt")

    return render(
        request, "stravaChimp/authorization.html", {"code": code, "access_token": access_token, "athleteId": athleteId}
    )
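
The view above persists per-athlete DataFrames on the model as JSON via to_json(orient="records") and rebuilds them with pd.read_json; a small sketch of that round-trip with illustrative column names (note that the records orientation does not store the index, so it comes back as a default 0..n-1 range).

from io import StringIO
import pandas as pd

summary = pd.DataFrame({"activityId": [101, 102], "avgHr": [143.0, 151.5]})

as_json = summary.to_json(orient="records")   # '[{"activityId":101,"avgHr":143.0}, ...]'
restored = pd.read_json(StringIO(as_json))    # index rebuilt as 0..n-1

assert set(restored.columns) == {"activityId", "avgHr"}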
Example #14
def hapmap3(data_set="hapmap3"):
    try:
        from pandas import read_pickle, DataFrame
        from sys import stdout
        import bz2
    except ImportError as i:
        raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
    if not data_available(data_set):
        download_data(data_set)
    dirpath = os.path.join(data_path, "hapmap3")
    hapmap_file_name = "hapmap3_r2_b36_fwd.consensus.qc.poly"
    preprocessed_data_paths = [
        os.path.join(dirpath, hapmap_file_name + file_name)
        for file_name in [".snps.pickle", ".info.pickle", ".nan.pickle"]
    ]
    if not reduce(lambda a, b: a and b, map(os.path.exists, preprocessed_data_paths)):
        if not overide_manual_authorize and prompt_user(
            "Preprocessing requires 17GB " "of memory and can take a long time, continue? [Y/n]\n"
        ):
            print "Preprocessing required for further usage."
            return
        status = "Preprocessing data, please be patient..."
        print status

        def write_status(message, progress, status):
            stdout.write(" " * len(status))
            stdout.write("\r")
            stdout.flush()
            status = r"[{perc: <{ll}}] {message: <13s}".format(
                message=message, ll=20, perc="=" * int(20.0 * progress / 100.0)
            )
            stdout.write(status)
            stdout.flush()
            return status

        unpacked_files = [os.path.join(dirpath, hapmap_file_name + ending) for ending in [".ped", ".map"]]
        if not reduce(lambda a, b: a and b, map(os.path.exists, unpacked_files)):
            status = write_status("unpacking...", 0, "")
            curr = 0
            for newfilepath in unpacked_files:
                if not os.path.exists(newfilepath):
                    filepath = newfilepath + ".bz2"
                    file_size = os.path.getsize(filepath)
                    with open(newfilepath, "wb") as new_file, open(filepath, "rb") as f:
                        decomp = bz2.BZ2Decompressor()
                        file_processed = 0
                        buffsize = 100 * 1024
                        for data in iter(lambda: f.read(buffsize), b""):
                            new_file.write(decomp.decompress(data))
                            file_processed += len(data)
                            write_status("unpacking...", curr + 12.0 * file_processed / (file_size), status)
                curr += 12
                status = write_status("unpacking...", curr, status)
        status = write_status("reading .ped...", 25, status)
        # Preprocess data:
        snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
        status = write_status("reading .map...", 33, status)
        mapnp = np.loadtxt(unpacked_files[1], dtype=str)
        status = write_status("reading relationships.txt...", 42, status)
        # and metainfo:
        infodf = DataFrame.from_csv("./relationships_w_pops_121708.txt", header=0, sep="\t")
        infodf.set_index("IID", inplace=1)
        status = write_status("filtering nan...", 45, status)
        snpstr = snpstrnp[:, 6:].astype("S1").reshape(snpstrnp.shape[0], -1, 2)
        inan = snpstr[:, :, 0] == "0"
        status = write_status("filtering reference alleles...", 55, status)
        ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0, 1)[:, :, :]))
        status = write_status("encoding snps...", 70, status)
        # Encode the information for each gene in {-1,0,1}:
        status = write_status("encoding snps...", 73, status)
        snps = snpstr == ref[None, :, :]
        status = write_status("encoding snps...", 76, status)
        snps = snps * np.array([1, -1])[None, None, :]
        status = write_status("encoding snps...", 78, status)
        snps = snps.sum(-1)
        status = write_status("encoding snps", 81, status)
        snps = snps.astype("S1")
        status = write_status("marking nan values...", 88, status)
        # put in nan values (masked as -128):
        snps[inan] = -128
        status = write_status("setting up meta...", 94, status)
        # get meta information:
        metaheader = np.r_[["family_id", "iid", "paternal_id", "maternal_id", "sex", "phenotype"]]
        metadf = DataFrame(columns=metaheader, data=snpstrnp[:, :6])
        metadf.set_index("iid", inplace=1)
        metadf = metadf.join(infodf.population)
        metadf.to_pickle(preprocessed_data_paths[1])
        # put everything together:
        status = write_status("setting up snps...", 96, status)
        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:, 1])
        snpsdf.to_pickle(preprocessed_data_paths[0])
        status = write_status("setting up snps...", 98, status)
        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:, 1])
        inandf.to_pickle(preprocessed_data_paths[2])
        status = write_status("done :)", 100, status)
        print ""
    else:
        print "loading snps..."
        snpsdf = read_pickle(preprocessed_data_paths[0])
        print "loading metainfo..."
        metadf = read_pickle(preprocessed_data_paths[1])
        print "loading nan entries..."
        inandf = read_pickle(preprocessed_data_paths[2])
    snps = snpsdf.values
    populations = metadf.population.values.astype("S3")
    hapmap = dict(
        name=data_set,
        description="The HapMap phase three SNP dataset - "
        "1184 samples out of 11 populations. inan is a "
        "boolean array, containing wheather or not the "
        "given entry is nan (nans are masked as "
        "-128 in snps).",
        snpsdf=snpsdf,
        metadf=metadf,
        snps=snps,
        inan=inandf.values,
        inandf=inandf,
        populations=populations,
    )
    return hapmap
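
A short usage sketch of the dict returned by hapmap3(), assuming the data files are available so the call succeeds; the fields follow the description string above.

hapmap = hapmap3()

snpsdf = hapmap["snpsdf"]          # samples x SNPs DataFrame, encoded as described above
inan = hapmap["inan"]              # boolean array marking missing entries (masked as -128 in snps)
populations = hapmap["populations"]

print(snpsdf.shape)
print(inan.mean())                 # fraction of missing genotype calls
print(set(populations))            # the 11 population codes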
def run_bl_analysis(pickles_folder=0):
    import matplotlib.pyplot as plt
    from os import listdir
    from os.path import join
    from pandas import read_pickle, DataFrame
    from article2_time_resolved_routines import find_nearest

    if not pickles_folder:
        pickles_folder = "/home/carlos/Documents/PhD/Articles/" + "Article_3/Scripts/time_resolved/averaged_data"

    case_pickles = [
        f
        for f in listdir(pickles_folder)
        if f.endswith(".p")
        if not "Slit" in f and "alpha0" in f and "phi0" in f and not "mean_flow_rotated" in f
    ]

    bl_df = DataFrame()

    fig, ax = plt.subplots(1, 1)

    for cp in case_pickles:
        case_bl_df = DataFrame()

        df = read_pickle(join(pickles_folder, cp))
        df = df.sort_values(by=["x", "y"])

        if "loc00" in cp and not "STE" in cp:
            x_bl_loc = 38
        elif "loc05" in cp:
            x_bl_loc = 18
        elif "loc10" in cp or "STE" in cp:
            x_bl_loc = -2

        available_x_loc = find_nearest(x_bl_loc, df.x.values)

        trailing_edge, phi, alpha, U, z = decript_case_name(cp)

        case_name = "{0}_a{1}_p{2}_U20_z{3:02.0f}_tr".format(trailing_edge, alpha, phi, float(z) * 20)

        print "   Running {0}".format(case_name)

        # First get the edge velocity, because it needs to be cleaned up a bit #
        ue_df = DataFrame()
        for x in df.x.unique():
            local_x_df = df[(df.x == x) & (df.y >= 0)]

            ue_df = ue_df.append({"U_e": get_edge_velocity(local_x_df), "x": x}, ignore_index=True)
        # ######################################################################

        ue_df = clean_data(ue_df, "U_e", window=10, threshold=1.0)

        for x, U_e_x in zip(ue_df.x.values, ue_df.U_e.values):
            local_x_df = df[(df.x == x) & (df.y >= 0) & (df.y < 20)]

            if x == available_x_loc:
                ax.plot(local_x_df.u / U_e_x, local_x_df.y, label=cp.replace("_", " "))

            U_e_loc, delta_99, delta_displacement, delta_momentum = get_boundary_layer_values(local_x_df, U_e_x)

            data = {
                "case": case_name,
                "Ue": U_e_x,
                "delta_99": delta_99,
                "delta_displacement": delta_displacement,
                "delta_momentum": delta_momentum,
                "x": x,
                "trailing_edge": trailing_edge,
                "phi": phi,
                "alpha": alpha,
                "z": z,
            }

            case_bl_df = case_bl_df.append(DataFrame(data, index=[0]), ignore_index=True)

        if "delta_99" in case_bl_df.columns:
            case_bl_df = clean_data(case_bl_df, "delta_99", window=10, threshold=1.0)

        bl_df = bl_df.append(case_bl_df, ignore_index=True)

    bl_df.to_pickle("BLData_staged.p")

    plt.legend(loc="best")
    plt.xlim(0, 1)
    plt.savefig("InterestingBLs.png")
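
A short illustrative read-back of the staged results (not part of the original routine; it assumes only the column names built in the data dict above):

from pandas import read_pickle

# Reload the staged boundary-layer results written by run_bl_analysis above
# and summarize the thicknesses per case.
bl_df = read_pickle("BLData_staged.p")
print bl_df.groupby("case")[["delta_99", "delta_displacement", "delta_momentum"]].mean()
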
    bioactivities_df.to_pickle(os.path.join(output_path, "chembl_bioactivities_of_" + acc + ".pkl"))

    # TODO: Join compound data with bioactivity data.
    print "Joining compound dataframes..."
    df = pd.merge(bioactivities_df, compounds_df, on="ingredient_cmpd_chemblid")
    print df.columns

    # You may want to add a check for bioactivity data reported in units other
    # than nM; a minimal sketch of such a check follows.
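    # Illustrative sketch only (not in the original script); it relies solely on
    # the 'units' column already present in df.
    non_nm = df[df.units != "nM"]
    if len(non_nm.index):
        print "Warning: %d bioactivity records use units other than nM" % len(non_nm.index)
        print non_nm.units.value_counts()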

    # Bioactivity that has Ki, IC50, Kd, Kd1, Kd2 type measurements in nM units
    print "Filtering for biophysical data..."
    df2 = df[
        (
            (df.bioactivity_type == "Ki")
            | (df.bioactivity_type == "IC50")
            | (df.bioactivity_type == "Kd")
            | (df.bioactivity_type == "Kd1")
            | (df.bioactivity_type == "Kd2")
        )
        & (df.units == "nM")
    ]
    print "For acc %s there are %d Ki/IC50/Kd type bioactivity data." % (acc, len(df2.index))

    # Summarized table for bioactivity that has Ki, IC50, Kd, Kd1, Kd2 type measurements in nM units
    df2_summary = df2[["ingredient_cmpd_chemblid", "smiles", "bioactivity_type", "operator", "value", "units"]]

    # Write df2_summary to a pickle file
    df2_summary.to_pickle(os.path.join(output_path, "chembl_bioact_ki_ic50_kd_summary_of_" + acc + ".pkl"))

    # Approved drugs for this target
    drugs = s.get_approved_drugs(str(target_chembl_id), frmt="json")
    df_drugs = DataFrame(drugs["approvedDrugs"])
    print "For acc %s there are %d approved drugs." % (acc, len(df_drugs))

    # Write the approved drugs for each target to a pickle file
    df_drugs.to_pickle(os.path.join(output_path, "chembl_approved_drugs_of_" + acc + ".pkl"))
Example #17
0
    def getFeatures(self, FeatureMethod, Data, n_quantas=4, n_clusters=7):
        if not FeatureMethod:
            FeatureMethod = raw_input("Choose feature method (Moments,Quantization) : ")

        print("\nCalculating Features for each subject...")
        # init
        FeaturesArray = np.array([])
        FeaturesDF = DF()
        method = FeatureMethod
        # loop over subjects
        PieceSize = int(raw_input("Piece Size= (CHANGE THIS TO MANUAL IN myClasses 125) "))
        SubjectsList = list(set(Data.index.get_level_values("subject")))
        for subject in SubjectsList:
            print(subject)
            subjectData = Data.loc[subject]  # raw subject data
            fs_signal = (
                subjectData.columns
            )  # continue here - make sure the features and labels are in sync across different piece_ind.
            numOfPieces = range(int(np.round(len(subjectData) / PieceSize)))[:-1]
            piecesIndexDict = {}
            piecesName = {}
            for i in numOfPieces:
                piecesName[i] = PieceSize * i
                piecesIndexDict[piecesName[i]] = range(PieceSize * i, PieceSize * (i + 1))
                subjectDataCutted = subjectData
            piecesIndex = list(piecesIndexDict.itervalues())
            piecesName = list(piecesName.itervalues())
            #
            subjectPiecesIndex = MultiIndex.from_product([SubjectsList, piecesName], names=["subject", "Piece_ind"])
            # loop over pieces
            for piece in piecesIndexDict:
                pieceRange = piecesIndexDict[piece]  # change this to PieceName, if data is already cutted
                sData = subjectData.loc[pieceRange]

                # calc features
                if FeatureMethod == "Moments":
                    FeatureTypeList = ["M1", "M2", "Skew", "Kurtosis"]
                    M1 = getFeatureDF(sData, np.mean, "m1")
                    M2 = getFeatureDF(sData, np.std, "m2")
                    Skew = getFeatureDF(sData, sstats.skew, "Skew")
                    Kurtosis = getFeatureDF(sData, sstats.kurtosis, "Kurtosis")
                    subjectFeatures = concat([M1, M2, Kurtosis, Skew])

                    if FeaturesDF.empty:
                        FeaturesDF = DF(columns=subjectPiecesIndex, index=subjectFeatures.index)
                    FeaturesDF[subject, piece] = subjectFeatures

                if FeatureMethod == "Quantization":
                    isQuantizedData = int(raw_input("is your data quantized?? "))
                    if not (isQuantizedData):
                        print("Data should be quantized in order to calculate Quantization features!!")
                        break
                    else:
                        FeatureTypeList = [
                            "ExpressionRatio",
                            "ExpressionLevel",
                            "ExpressionLength",
                            "ChangeRatio",
                            "FastChangeRatio",
                        ]
                        #                if 'quantizedDF' in self.data: %TODO- fix this
                        #                    self.data.calcQuantize()
                        k = n_quantas
                        quantizedData = Data  # TODO: make sure it is also divided into pieces
                        sData = quantizedData.loc[subject].loc[pieceRange]  # subject data
                        cols = quantizedData.columns

                        # calc features using quantized vector:
                        ExpressionRatio, ExpressionLevel, ExpressionLength, ChangeRatio, FastChangeRatio = featuresUtils.calcQuantizationFeatures(
                            sData, k
                        )
                        FeaturesDict = {
                            "ExpressionRatio": ExpressionRatio,
                            "ExpressionLevel": ExpressionLevel,
                            "ExpressionLength": ExpressionLength,
                            "ChangeRatio": ChangeRatio,
                            "FastChangeRatio": FastChangeRatio,
                        }
                        multInd = MultiIndex.from_product([FeatureTypeList, cols], names=["FeatureType", "fs-signal"])
                        subjectFeatures = DF(
                            concat(
                                [ExpressionRatio, ExpressionLevel, ExpressionLength, ChangeRatio, FastChangeRatio]
                            ).values,
                            index=multInd,
                        )
                        if FeaturesDF.empty:
                            FeaturesDF = DF(columns=subjectPiecesIndex, index=subjectFeatures.index)
                        FeaturesDF[subject, piece] = subjectFeatures

                if FeatureMethod == "kMeansClustering":
                    n_clusters = 7
                    isClusteredData = int(raw_input("is your data clustered?? "))
                    if not (isClusteredData):
                        print("Data should be clustered in order to calculate clustering features!!")
                        break
                    print("clustering data... num of clusters = " + str(n_clusters))
                    clusteredData = Data
                    sData = clusteredData.loc[subject]
                    subjectFeatures = featuresUtils.calckMeansClusterFeatures(
                        sData, n_clusters
                    )  # TODO make sure it works for segmented Data
                    if FeaturesDF.empty:
                        FeaturesDF = DF(columns=subjectPiecesIndex, index=subjectFeatures.index)
                    FeaturesDF[subject, piece] = subjectFeatures

        # Pre-processing over all features (normalization and dropna())
        GetNormRows = lambda x: (x - x.mean()) / x.std()
        ProcessedFeaturesDF = FeaturesDF.apply(GetNormRows, axis=1)  # normalize each feature over all subjects
        ProcessedFeaturesDF.fillna(
            0, inplace=True
        )  # make sure there are no inappropriate NaNs  # TODO - this should be removed, to get the same FeaturesDF as in 'getMissingFeatures'
        if FeatureMethod == "Quantization":
            ProcessedFeaturesDF = featuresUtils.getMissingFeatures(self)  # calc the real NaNs

        # multIndex=MultiIndex.from_product([FeatureTypeList,fs_signal],names=['FeatureType','fs-signal'])
        # FeaturesArray=StandardScaler().fit_transform(FeaturesArray.T) #normalize each feature over all subjects.
        # FeaturesDF=DF(FeaturesArray.T,index=multIndex,columns=self.SubjectsList)
        # self.FeaturesDF.index.set_names(names=['FeatureType','fs-signal']) #TODO - make sure this works!

        if isSave:
            FeaturesPath = raw_input("enter features Path: ")
            print("\nsaving to pickle...")
            FeaturesDF.to_pickle(FeaturesPath + "raw.pickle")
            ProcessedFeaturesDF.to_pickle(FeaturesPath + ".pickle")
            print("saving to csv...")
            FeaturesDF.to_csv(FeaturesPath + "rawDF.csv")
            ProcessedFeaturesDF.to_csv(FeaturesPath + "DF.csv")
            print("All Features Data successfully saved to " + FeaturesPath)
Example #18
0
			elo_timeseries.ix[tourney_date_timestamp, loser_id]= new_elo_loser
			
	##Uncomment to output year end elo_rankings for every year between 1968 and 2015
	#output_file_name = str(current_year) + '_yr_end_elo_ranking.csv'
	#players.to_csv(output_file_name)

	current_year = current_year + 1


players.to_csv('2015_yr_end_elo_ranking.csv')
players = pandas.read_csv('2015_yr_end_elo_ranking.csv')
#Print all-time top 10 peak_elo
print players.sort(columns= 'peak_elo', ascending=False)[:10]

#Save elo_timeseries dataframe for plotting purposes
elo_timeseries.to_pickle('elo_timeseries.pkl')

#Open saved pickle file and save into a dataframe
elo_timeseries = pandas.read_pickle('elo_timeseries.pkl')

#Convert objects within elo_timeseries dataframe to numeric
elo_timeseries = elo_timeseries.convert_objects(convert_numeric=True)

#Use linear interpolation for elo_ratings
elo_timeseries = elo_timeseries.interpolate(method='linear')

#Store the indices in the elo_timeseries in a list
index_timestamp = list(elo_timeseries.index.values)

#Get rid of elo ratings since known last_tourney_date
for player in elo_timeseries_col_header:
Example #19
0
    "message_id": Series(message_ids),
    "convthread_id": Series(convthread_ids),
    "message": Series(messages),
    "popularity": Series(popularities),
}
d = DataFrame(d)
d.to_pickle("messages.pkl")

# thread
d = {
    "convthread_id": Series(convthread_ids),
    "title": Series(convthread_titles),
    "lat": Series(lats),
    "lng": Series(lngs),
}
d = DataFrame(d)
d.to_pickle("convthreads.pkl")

# taggings
d = {
    "tagging_id": Series(tagging_ids),
    "convthread_id": Series(tagging_convthread_ids),
    "tag_id": Series(tagging_tag_ids),
}
d = DataFrame(d)
d.to_pickle("taggings.pkl")

# tags
d = {"tag_id": Series(tag_ids), "tag": Series(tags)}
d = DataFrame(d)
d.to_pickle("tags.pkl")
Example #20
0
alpha = 0.1
initValue = 15
theta = 0.00001
softmax = False
skip = False

game = pg.PredatorGame((0, 0), (5, 5), (11, 11))
if not skip:

    results = dict()
    for initValue in [0, 1, 10, 15]:
        results[initValue] = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
        print initValue
    results["episode"] = range(0, episodes)
    dataF = DataFrame(results)
    dataF.to_pickle("data/initValues" + str(episodes))
else:
    dataF = pd.read_pickle("data/initValues" + str(episodes))

episodeData = pd.melt(dataF, id_vars=["episode"], var_name="initValue")

# plt.ioff()
# x = qplot(range(0,4), [0.68834, 0.76024, 0.82407, 0.82113], geom = ["point", "line"])
# print x
# print qplot([0,1], [0.68834, 0.76024])
p = (
    ggplot(episodeData, aes("episode", "value", color="initValue"))
    + geom_line()
    + theme_bw()
    + theme()
    + ylab("Steps")
Example #21
0
        for epsilon in parametersFor(category):
            results[epsilon] = getResults(samples, episodes, discount, epsilon, decay)
    elif category == "decay":
        for decay in parametersFor(category):
            results[decay] = getResults(samples, episodes, discount, epsilon, decay)
    elif category == "discount":
        for discount in parametersFor(category):
            print (discount)
            results[discount] = getResults(samples, episodes, discount, epsilon, decay)
    else:
        sys.exit()
    print (results)
    results["episode"] = range(1, episodes + 1)

    dataF = DataFrame(results)
    dataF.to_pickle("data/" + str(episodes) + category + "small")
    # pickle.dump(randomReturnValues, open('data/values'+str(episodes)+category+str(softmax), 'w+'))
else:
    dataF = pd.read_pickle("data/" + str(episodes) + category)
    # randomReturnValues = pickle.load(open('data/values'+str(episodes)+category+str(softmax), 'r+'))

print dataF
if smoothing:
    for par in parametersFor(category):
        dataF[par] = scipy.ndimage.filters.gaussian_filter(dataF[par], 5 * (episodes / 4000), 0)
episodeData = pd.melt(dataF, id_vars=["episode"], var_name=category)

ylabel = "Steps"

p = (
    ggplot(episodeData, aes("episode", "value", color=category))
N = 1
if not skip:
    for i in range(N):
        print i

        averageQ, predwinsratioQ = getIndependentQLearning()

        averageS, predwinsratioS = getIndependentSarsa()

    data["IndependentQLearning"] = predwinsratioQ
    data["IndependentSarsa"] = predwinsratioS
    data["episode"] = range(1, episodes + 1)

    dataF = DataFrame(data)
    dataF.to_pickle("data/comparison")
else:
    dataF = pd.read_pickle("data/comparison")

for a in alg:
    dataF[a] = scipy.ndimage.filters.gaussian_filter(dataF[a], 5 * (episodes / 4000), 0)


episodeData = pd.melt(dataF, id_vars=["episode"], var_name="Algorithm")

p = (
    ggplot(episodeData, aes("episode", "value", color="Algorithm"))
    + geom_line()
    + theme_bw()
    + theme()
    + ylab("Win ratio")