def form():
    file_path = ""
    # load the model
    cnn = keras.models.load_model('dr.h5')
    try:
        # get the image
        if request.method == "POST":
            image = request.files['image']
            img_name = image.filename
            file_path = os.path.join('./static/uploaded_images', img_name)
            image.save(file_path)
            # preprocess the image to make it similar
            # to training data
            a = Preprocess()
            a.preprocess(file_path)
            # image is converted to grayscale
            # and then to numpy array
            image = Image.open('./static/uploaded_images/preprocessed.jpeg')
            image = ImageOps.grayscale(image)
            img_arr = img.img_to_array(image)
            img_arr = img_arr.astype("float32")
            img_arr = img_arr / 255.0
            img_arr = np.expand_dims(img_arr, axis=0)
            # prediction
            predict = cnn.predict(img_arr)
            pred = np.argmax(predict[0])
            os.remove(file_path)
            os.remove('./static/uploaded_images/preprocessed.jpeg')
            return render_template("index.html", image_name=pred)
        else:
            return render_template("index.html", image_name="None")
    except:
        return render_template("index.html",
                               image_name="No proper image file selected")
Example #2
 def __init__(self, p, q, lag):
     self.preprocess = Preprocess(lag=lag)
     self.p = p  # order of residual term
     self.q = q  # order of variance term
     self.omega = np.array([])
     self.alpha = np.empty(shape=(0, self.p))  # residual term parameter
     self.beta = np.empty(shape=(0, self.q))  # variance term parameter
    def classify(self,img_mdf):
        """ Execute the model of neuronal network
        Returns if the image is an ostrich.

            Args:
                img_mdf(numpy array). Array of image data

            Returns:
                True. If the max value of precitions is ostrich
        """

        if not os.path.exists(os.path.join(h5_path,"ostrich.h5")):
            print("Invalid path")
            return

        model = keras.models.load_model(os.path.join(h5_path,"ostrich.h5"))
        preprocess = Preprocess()
        img_mdf = preprocess.prepare_image(img_mdf,60)
        predictions = model.predict(img_mdf)

        max = np.argmax(predictions)

        categories = ["ostrich","not an ostrich"]

        result = categories[max]

        if (result == "ostrich"):
            return True
        else:
            return False
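Since model.predict returns a 2-D array of shape (batch_size, n_classes), calling np.argmax on the whole result works here only because a single image is passed. A tiny illustration with made-up probabilities:

import numpy as np

predictions = np.array([[0.9, 0.1]])   # hypothetical output for one image, two classes
print(np.argmax(predictions))          # 0, i.e. "ostrich" in the categories list above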
Example #4
def preprocessData():
    preprocessing = Preprocess(data="fundamental_ratios")
    print("retrieving fundamental ratios...")
    fr_train, fr_validate = preprocessing.get_data(dataType="scaled",
                                                   dset="train_validate")
    print("retrieving returns...")
    ar = preprocessing.retrieve_return()
    print("split returns...")
    ar_train = ar[ar.index.isin(fr_train.index)]
    ar_validate = ar[ar.index.isin(fr_validate.index)]
    print("trim fundamental ratios...")
    fr_train = fr_train[fr_train.index.isin(ar_train.index)]
    fr_validate = fr_validate[fr_validate.index.isin(ar_validate.index)]
    # remove boundary values
    print(ar_train)
    ar_train.drop(ar_train.nlargest(250, "return").index, axis=0, inplace=True)
    ar_train.drop(ar_train.nsmallest(250, "return").index,
                  axis=0,
                  inplace=True)
    # re-order train set for visualization
    ar_train = ar_train.sort_values("return")
    fr_train = fr_train.loc[ar_train.index]
    train = (fr_train, ar_train)
    # re-order validation set for visualization
    ar_validate = ar_validate.sort_values("return")
    fr_validate = fr_validate.loc[ar_validate.index]
    validate = (fr_validate, ar_validate)
    return train, validate
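preprocessData therefore returns two (fundamental-ratio, return) tuples whose indices have been aligned and sorted by return. A hypothetical call site, assuming the project's Preprocess data sources are configured:

train, validate = preprocessData()
fr_train, ar_train = train
fr_validate, ar_validate = validate
print(fr_train.shape, ar_train.shape, fr_validate.shape, ar_validate.shape)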
Example #5
def test():
    import pandas as pd
    import numpy as np
    from Preprocess import Preprocess
    train = pd.read_csv('~/Downloads/ds-project-train.csv',
                        dtype={
                            'SHIPPER.ADDRESS': str,  # np.str was removed from NumPy; plain str is equivalent here
                            'ZIPCODE': str
                        },
                        parse_dates=['ARRIVAL.DATE'])
    test = pd.read_csv('~/Downloads/ds-project-test.csv',
                       dtype={
                           'SHIPPER.ADDRESS': str,
                           'ZIPCODE': str
                       },
                       parse_dates=['ARRIVAL.DATE'])
    p = Preprocess()
    X_train = p.run(df=train)
    X_test = p.run(df=test, test=True)

    y_train = X_train['COUNTRY.OF.ORIGIN']
    X_train = X_train.drop(['COUNTRY.OF.ORIGIN'], axis=1)

    y_test = X_test['COUNTRY.OF.ORIGIN']
    X_test = X_test.drop(['COUNTRY.OF.ORIGIN'], axis=1)

    fe = FeatureEngineering()
    X_train = fe.run(df=X_train)
    X_test = fe.run(df=X_test, test=True)

    print('!')
Example #6
    def classify(self, img_test):
        """ Execute the model of neuronal network and return true if the image is a 
            guinea pig.

            Args:
                img_mdf(numpy array). Array of image data

            Returns:
                True. If the max value of precitions is guinea pig
        """

        save_path = os.path.join(h5_path, "guineapig.h5")

        preprocess = Preprocess()

        img = preprocess.prepare_image(img_test, 60)

        if not os.path.exists(save_path):
            raise FileExistsError("File guineapig.h5 was deleted. :(")

        model = keras.models.load_model(save_path)

        predictions = model.predict(img)

        max = np.argmax(predictions)

        categories = ["guinea pig", "not guinea pig"]

        result = categories[max]

        if (result == "guinea pig"):
            return True
        else:
            return False
Example #7
class InformationRatio:
    def __init__(self, benchmark="snp500"):
        self.benchmark = benchmark
        self.preprocess = Preprocess()
        self.alpha = None
        self.index = None

    def computeInformationRatio(self, portfolio):
        returns = self.preprocess.retrieve_return()
        index = self.preprocess.retrieve_benchmark_change("snp500")
        if self.alpha is None:
            alpha = post.compute_alpha(
                index, returns).loc[portfolio.keys()]["alpha"].values
            self.alpha = alpha
        else:
            alpha = self.alpha
        #print("alpha:", alpha)
        weight = np.array(list(portfolio.values()))
        #print("weight", weight)
        portfolio_return = np.sum(np.multiply(alpha, weight))
        #print("portfolio return", portfolio_return)
        volatility = np.std(alpha)
        #print("volatility", volatility)
        if self.index is None:
            index = self.preprocess.retrieve_benchmark_change(
                self.benchmark) - 1
            self.index = index
        else:
            index = self.index
        #print("benchmark", index)
        information_ratio = (portfolio_return - index) / volatility
        return information_ratio
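The method boils down to an alpha-weighted portfolio return, less the benchmark change, divided by the dispersion of alpha. A self-contained numeric sketch of just that arithmetic, with made-up values:

import numpy as np

alpha = np.array([0.04, 0.01, -0.02])   # hypothetical per-stock alphas
weight = np.array([0.5, 0.3, 0.2])      # hypothetical portfolio weights

portfolio_return = np.sum(alpha * weight)    # 0.019
volatility = np.std(alpha)                   # dispersion of alpha, as in the class above
benchmark_change = 0.01                      # assumed benchmark return over the same lag

information_ratio = (portfolio_return - benchmark_change) / volatility
print(round(information_ratio, 3))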
Example #8
 def test_retrieve_mkt_caps(self):
     self.preprocess = Preprocess(lag=7)
     try:
         df = self.preprocess.retrieve_mkt_caps(["GE", "MMM", "APPL"])
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except:
         self.fail()
Example #9
 def test_retrieve_dividends(self):
     self.preprocess = Preprocess(lag=7)
     try:
         df = self.preprocess.retrieve_dividends()
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except:
         self.fail()
Example #10
 def test_retrieve_fundamental_ratios(self):
     self.preprocess = Preprocess()
     try:
         df = self.preprocess.retrieve_fundamental_ratios()
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except:
         self.fail()
Example #11
 def test_retrieve_benchmark(self):
     self.preprocess = Preprocess(lag=30)
     try:
         df = self.preprocess.retrieve_benchmark("snp500")
         if not isinstance(df, pd.DataFrame) or df.empty:
             raise Exception
     except:
         self.fail()
Example #12
 def test_retrieve_benchmark_change(self):
     self.preprocess = Preprocess(lag=7)
     try:
         change = self.preprocess.retrieve_benchmark_change("snp500")
         if not isinstance(change, float):
             raise Exception
     except:
         self.fail()
Example #13
 def __init__(self, asset, risk_free=0):
     self.preprocess = Preprocess()
     self.asset = asset
     self.risk_free = risk_free
     self.covariance = None  # type: pd.DataFrame
     self.mean = None  #pd.Series(index=asset)
     self.max_sharpe_comp = None  # maximum sharpe portfolio composition
     self.min_vol_comp = None  # minimum volatility portfolio composition
Example #14
class OptionPair:
    """constructor
    """
    def __init__(self):
        self.preprocess = Preprocess(lag=70)

    """compute_correlation
        Description:
            compute correlation between all stocks
    """

    def compute_correlation(self):
        daily_price = self.preprocess.retrieve_open_close()
        daily_change = post.compute_daily_change(daily_price)
        return daily_change.corr(method='pearson', min_periods=30)

    """find_movement_pairs
        Description:
            find stock pairs with high daily movement correlation
        Input:
            corr: correlation matrix of all stocks
            threshold: correlation coefficient threshold
    """

    @staticmethod
    def find_movement_pairs(corr, threshold=0.95):
        pairs = []
        for symbol in corr:
            for (i, v) in corr[symbol].items():  # Series.iteritems() was removed in pandas 2.0
                if i == symbol:
                    continue
                else:
                    if abs(v) > threshold:
                        pairs.append((symbol, i, v))
        return pairs

    """narrow_growth_pairs
        Description:
            Even highly correlated daily-movement pairs can drift apart in long-term growth;
            this method narrows the correlation pairs to those with similar growth over time.
        Input:
            pairs: best daily movement correlation pairs
            threshold: growth drift threshold
    """

    def narrow_growth_pairs(self, pairs, threshold=0.05):
        returns = self.preprocess.retrieve_return()

        for pair in list(pairs):  # iterate over a copy so removal below is safe
            try:
                r1 = returns.loc[pair[0], "return"]
                r2 = returns.loc[pair[1], "return"]
                drift = abs(r1 - r2) / abs((r1 + r2) / 2)
            except KeyError:  # remove pair if return cannot be validated
                drift = 1
            if drift > threshold:
                pairs.remove(pair)
        return pairs
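A hypothetical end-to-end use of OptionPair, assuming the project's Preprocess can supply the price and return data it queries:

op = OptionPair()
corr = op.compute_correlation()                               # Pearson correlation of daily changes
pairs = OptionPair.find_movement_pairs(corr, threshold=0.95)  # highly correlated movers
pairs = op.narrow_growth_pairs(pairs, threshold=0.05)         # drop pairs whose growth diverges
print(pairs[:5])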
Example #15
 def test_scale_data(self):
     self.preprocess = Preprocess(lag=7)
     data = [('x', [1, 2, 3, 4]), ('y', [51, -6, 43, -8])]
     df = pd.DataFrame(dict(data))  # DataFrame.from_items was removed in pandas 1.0
     scaled = self.preprocess.scale_data(df)
     self.assertTrue(scaled['x'].max() <= 1)
     self.assertTrue(scaled['y'].max() <= 1)
     self.assertTrue(scaled['x'].min() >= 0)
     self.assertTrue(scaled['y'].min() >= 0)
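The assertions imply that scale_data performs per-column min-max scaling into [0, 1]; a minimal pandas equivalent of that behaviour (not necessarily the project's actual implementation) is:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [51, -6, 43, -8]})
scaled = (df - df.min()) / (df.max() - df.min())  # each column mapped onto [0, 1]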
Example #16
 def test_retrieve_return(self):
     self.preprocess = Preprocess(lag=7)
     try:
         df1 = self.preprocess.retrieve_return(
         )  # non-split is a super set of split returns
         if not isinstance(df1, pd.DataFrame) or df1.empty:
             raise Exception
     except:
         self.fail()
Example #17
 def __init__(self, topics_filename, dir_name):
     self.parser = TopicsParser()
     self.preProcess = Preprocess()
     self.preprocessed = True
     self.topics_parsed = self.parser.get_data(topics_filename)
     self.topics = dict()
     for topic in self.topics_parsed:
         self.topics[topic['num']] = " ".join(
             self.preProcess.preprocess(topic['title'] + ' ' +
                                        topic['narr'] + ' ' +
                                        topic['desc']))
Example #18
 def test_filter_column(self):
     self.preprocess = Preprocess(density=0.5, lag=7)
     data = [('symbol', ['A', 'B', 'C', 'D']), ('index', [150, 200, 50,
                                                          10]),
             ('date', [200, 210, 90, 20]), ('currency', [140, 215, 95, 30]),
             ('latestadate', [140, 215, 95, 40]),
             ('dense', [140, 215, np.NaN, 50]),
             ('sparse', [np.NaN, np.NaN, np.NaN, 60])]
     df = pd.DataFrame(dict(data))  # DataFrame.from_items was removed in pandas 1.0
     filtered = self.preprocess.filter_column(df)
     self.assertEqual(len(filtered.columns),
                      2)  # only symbol and dense will survive
Example #19
 def __init__(self, load, dir_name, files):
     self.preProcess = Preprocess()
     self.documentParser = DocumentParser()
     self.preprocessed = True
     if not load:
         if not os.path.isdir(dir_name):
             os.mkdir(dir_name)
         schema = Schema(id=TEXT(stored=True), content=TEXT(stored=True))
         self.ix = create_in(dir_name, schema)
         self.index(files)
     else:
         self.ix = open_dir(dir_name)
Example #20
	def __init__(self, df = None,
				continuous_features=[],
				unordered_categorical_features = [],
				ordered_categorical_features = []):
		self.data = df
		self.continuous_features = continuous_features
		self.unordered_categorical_features = unordered_categorical_features
		self.ordered_categorical_features = ordered_categorical_features
		self.prprcs = Preprocess()
		self.fs = FieldStatistics()
		self.trnsfrmr = Transformer()
		self.imptr = ImputeData()
		self.pltr = Plotting()
Example #21
def main():
    '''
    Main function to preprocess data. This function uses the Preprocess class.
    '''
    dataDirectory = './Data/ToProcessData'
    preprocessor = Preprocess(dataDirectory)
    print('Object created')
    st = time.time()
    preprocessor.PreprocessData()
    # preprocessor.PrepareTrainTestSet()
    print('Preprocess data execution ended')
    en = time.time()
    print('Time taken to process data = ', en - st, ' sec')
 def contours2(img):
     # equalize Y channel hist
     # img = Preprocess.equalizeHistYChannel(img)
     # remove artifacts
     # img = Preprocess.removeArtifact(img)
     # remove RGB artifact
     img = Preprocess.removeArtifactYUV(img)
     # apply OTSU threshold
     ret, thresh = Preprocess.OTSUThreshold(img)
     # search for contours and select the biggest one
     c, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE,
                                               cv2.CHAIN_APPROX_NONE)
     cnt = max(contours, key=cv2.contourArea)
     return cnt
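Note that the three-value unpacking of cv2.findContours above matches the OpenCV 3.x API; OpenCV 4.x returns only (contours, hierarchy). A version-tolerant variant of the same lookup:

import cv2

def biggest_contour(thresh):
    # OpenCV 3.x returns (image, contours, hierarchy); 4.x returns (contours, hierarchy)
    found = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    contours = found[-2]
    return max(contours, key=cv2.contourArea)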
Example #23
    def __init__(self):
        '''
        Initialize.
        Runs by default when a new object is created.
        '''

        # ## Windows

        # Main window
        self.mainWindow = MainWindow()

        # Information window
        self.informationMessageApp = QtWidgets.QWidget()
        self.informationMessageWindow = MessageWindow()

        # Service objects
        self.preprocess = Preprocess()
        self.process = Process()
        self.postprocess = Postprocess()
        self.attachmentMatch = AttachmentMatch()
        self.jsonService = JsonService()

        self.settingJsonService = JsonService('settings.json')

        # ## Connect slots and signals

        # Express process / Start: pressed --> express processing
        self.mainWindow.expressProcessButton.pressed.connect(
            self.expressProcess)

        # General process / Start: pressed --> general processing
        self.mainWindow.generalProcessButton.pressed.connect(
            self.generalProcess)

        # General process / attachment-match checkbox: state changed --> enable or disable attachment matching
        self.mainWindow.shouldMatchAttachmentCheckBox.stateChanged.connect(
            self.shouldEnableAttachmentMatch)

        # General process / original data browse: pressed --> choose the original data file
        self.mainWindow.generalProcessOriginalDataExploreButton.pressed.connect(
            self.exploreOriginalDataFile)

        # General process / attachment directory browse: pressed --> choose the directory containing attachments
        self.mainWindow.generalProcessAttachmentLocationExploreButton.pressed.connect(
            self.exploreAttachmentDirectory)

        # General process / export data browse: pressed --> choose the export data file
        self.mainWindow.generalProcessExportFileExploreButton.pressed.connect(
            self.exploreExportDataFile)
 def __init__(self, name,topics_filename,relevance_filename, preprocessed):
     super().__init__(name)
     self.topicIndex = TopicsIndex('Topic Index', self,topics_filename,relevance_filename, preprocessed)
     self.documentParser = DocumentParser()
     self.docLength = 0
     self.dictionary={}
     self.size = 0
     self.preProcess=Preprocess()
     self.evalDocs = self.calcEvaluatedDocs()
     self.preprocessed = preprocessed
     self.processingTime = 0 
     self.indxingTime = 0
     self.processingMemory = 0
     self.indexingMemory = 0
     self.document_lengths=dict()
Example #25
 def colorSDG(img, contour):
     '''
         calculate the Standard Deviation Grayscale
     '''
     # remove artifact
     img = Preprocess.removeArtifactYUV(img)
     # extract the lesion
     lesion = Caracteristics.extractLesion(img, contour)
     # convert img to gray
     lesion = cv2.cvtColor(lesion, cv2.COLOR_RGB2GRAY)
     # get bounding rect
     x, y, w, h = cv2.boundingRect(contour)
     # crop the rect
     lesion = lesion[y:y + h, x:x + w]
     # lesion area
     lesionArea = cv2.contourArea(contour)
     # sum of pixels
     s = np.sum(lesion)
     # get mean color value
     mean = s // lesionArea
     # calculate SDG
     lesion[lesion != 0] = np.subtract(lesion[lesion != 0], mean)
     lesion = np.power(lesion, 2)
     SDG = np.sum(lesion)
     # SDG = 0
     # for i in range(0, h):
     #     for j in range(0, w):
     #         if lesion[i, j] != 0:
     #             SDG = SDG + ((lesion[i, j] - mean)**2)
     SDG = np.sqrt((1 / lesionArea) * SDG)
     SDG = round(SDG, 2)
     return SDG
 def asymmetryIndex(img, contour):
     '''
         get asymmetry index
         search for homologue of each pixel
     '''
     # remove artifact
     img = Preprocess.removeArtifactYUV(img)
     # convert img to gray
     imgray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
     # get bounding rect
     x, y, w, h = cv2.boundingRect(contour)
     # crop the rect
     rect = imgray[y:y + h, x:x + w]
     # rotate 180°
     rotated = imutils.rotate_bound(rect, 180)
     # intersection between rect and rotated (search)
     intersection = cv2.bitwise_and(rect, rotated)
     imgray[y:y + h, x:x + w] = intersection
     # get area of intersection (black means no homologues found)
     intersectionArea = np.sum(intersection != 0)
     noHomologueArea = np.sum(intersection == 0)
     # lesion area
     lesionArea = cv2.contourArea(contour)
     # asymmetry
     asymmetry = (noHomologueArea / lesionArea) * 100
     asymmetry = round(asymmetry, 2)
     return asymmetry
 def inflammationAndBloodness(img, contour):
     '''
         returns the inflammation and bloodness factor (presence of red colors)
     '''
     # remove artifact
     img = Preprocess.removeArtifactYUV(img)
     # extract the lesion
     lesion = Caracteristics.extractLesion(img, contour)
     # lesion area
     lesionArea = cv2.contourArea(contour)
     # get bounding rect
     x, y, w, h = cv2.boundingRect(contour)
     # crop the rect
     lesion = lesion[y:y + h, x:x + w]
     # convert to HSV
     lesionHSV = cv2.cvtColor(lesion, cv2.COLOR_BGR2HSV)
     # set color intervals
     redH1 = np.array([0, 100, 220], dtype=np.uint8)
     redL1 = np.array([10, 125, 253], dtype=np.uint8)
     redH2 = np.array([160, 130, 100], dtype=np.uint8)
     redL2 = np.array([180, 255, 253], dtype=np.uint8)
     intervalsL = [redH1, redH2]
     intervalsH = [redL1, redL2]
     # check colors
     nbColors = 0
      # seuil: threshold (percentage of a color's area relative to the total lesion area)
     seuil = 1
     for i in range(0, len(intervalsH)):
         L = intervalsL[i]
         H = intervalsH[i]
         mask = cv2.inRange(lesionHSV, L, H)
         n = np.sum(mask != 0) / lesionArea * 100
         if n > seuil:
             nbColors += 1
     return nbColors
Example #28
def loadScoreTs(stock, wordCounterTsFilename=TS_FILENAME):
	if wordCounterTsFilename not in wordCounters:
		wordCounters[wordCounterTsFilename] = Preprocess.loadTs(wordCounterTsFilename)
	wordCounterTs = wordCounters[wordCounterTsFilename]

	topicWordCountTs = wordCounterTs.getTopicTs(stock)
	return sent.getScoreTimeseries(topicWordCountTs)
Example #29
 def colorHSVIntervals(img, contour):
     '''
         returns the number of colors in a lesion by assigning colors to HSV intervals
     '''
     # remove artifact
     img = Preprocess.removeArtifactYUV(img)
     # extract the lesion
     lesion = Caracteristics.extractLesion(img, contour)
     # lesion area
     lesionArea = cv2.contourArea(contour)
     # get bounding rect
     x, y, w, h = cv2.boundingRect(contour)
     # crop the rect
     lesion = lesion[y:y + h, x:x + w]
     # convert to HSV
     lesionHSV = cv2.cvtColor(lesion, cv2.COLOR_BGR2HSV)
     # set color intervals
     whiteH = np.array([0, 0, 254], dtype=np.uint8)
     whiteL = np.array([180, 250, 255], dtype=np.uint8)
     blackH = np.array([0, 0, 1], dtype=np.uint8)
     blackL = np.array([180, 120, 150], dtype=np.uint8)
     redH1 = np.array([0, 100, 220], dtype=np.uint8)
     redL1 = np.array([10, 125, 253], dtype=np.uint8)
     redH2 = np.array([160, 130, 100], dtype=np.uint8)
     redL2 = np.array([180, 255, 253], dtype=np.uint8)
     darkBrownH1 = np.array([0, 30, 140], dtype=np.uint8)
     darkBrownL1 = np.array([10, 120, 253], dtype=np.uint8)
     darkBrownH2 = np.array([0, 130, 120], dtype=np.uint8)
     darkBrownL2 = np.array([10, 255, 253], dtype=np.uint8)
     lightBrownH1 = np.array([11, 50, 140], dtype=np.uint8)
     lightBrownL1 = np.array([21, 255, 253], dtype=np.uint8)
     lightBrownH2 = np.array([160, 30, 170], dtype=np.uint8)
     lightBrownL2 = np.array([180, 100, 253], dtype=np.uint8)
     lightBrownH3 = np.array([11, 120, 100], dtype=np.uint8)
     lightBrownL3 = np.array([21, 255, 253], dtype=np.uint8)
     blueGrayH1 = np.array([120, 10, 150], dtype=np.uint8)
     blueGrayL1 = np.array([180, 120, 170], dtype=np.uint8)
     blueGrayH2 = np.array([0, 120, 100], dtype=np.uint8)
     blueGrayL2 = np.array([10, 130, 190], dtype=np.uint8)
     intervalsL = [
         whiteH, blackH, redH1, redH2, darkBrownH1, darkBrownH2,
         lightBrownH1, lightBrownH2, lightBrownH3, blueGrayH1, blueGrayH2
     ]
     intervalsH = [
         whiteL, blackL, redL1, redL2, darkBrownL1, darkBrownL2,
         lightBrownL1, lightBrownL2, lightBrownL3, blueGrayL1, blueGrayL2
     ]
     # check colors
     nbColors = 0
      # seuil: threshold (percentage of a color's area relative to the total lesion area)
     seuil = 6
     for i in range(0, len(intervalsH) - 1):
         L = intervalsL[i]
         H = intervalsH[i]
         mask = cv2.inRange(lesionHSV, L, H)
         n = np.sum(mask != 0) / lesionArea * 100
         if n > seuil:
             nbColors += 1
     return nbColors
 def asymmetryBySubRegionCentered(img, contour):
     '''
         get asymmetry by dividing the lesion into 4 subregions,
         with the lesion placed at the center of the image
     '''
     # remove artifact
     img = Preprocess.removeArtifactYUV(img)
     # convert img to gray
     img = Caracteristics.extractLesion(img, contour)
     imgray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
     # binarize the img
     # imgray[imgray > 0] = 255
     # find best fit ellipse
     (_, _), (_, _), angle = cv2.fitEllipse(contour)
     # get bounding rect
     x, y, w, h = cv2.boundingRect(contour)
     # get moments of contour
     M = cv2.moments(contour)
     # center of gravity of the lesion
     xe = int(M["m10"] / M["m00"])
     ye = int(M["m01"] / M["m00"])
     # get the centered rect
     cx = x + w // 2
     cy = y + h // 2
     deltaX1 = abs(int(xe - cx))
     deltaY1 = abs(int(ye - cy))
     x1 = x + deltaX1
     w1 = w + deltaX1
     y1 = y + deltaY1
     h1 = h + deltaY1
     padding = 0
     # crop the rect
     rect = imgray[y1 - padding:y1 + h1 + padding,
                   x1 - padding:x1 + w1 + padding]
     # rotate the lesion according to its best fit ellipse
     rect = ndimage.rotate(rect, angle, reshape=False)
     # flip H, flip V, flip VH
     rectH = cv2.flip(rect, 0)
     rectV = cv2.flip(rect, 1)
     rectVH = cv2.flip(rect, -1)
     # lesion area
     lesionArea = cv2.contourArea(contour)
     # intersect rect and rectH
     intersection1 = cv2.bitwise_and(rect, rectH)
     intersectionArea1 = np.sum(intersection1 != 0)
     result1 = (intersectionArea1 / lesionArea) * 100
     # intersect rect and rectV
     intersection2 = cv2.bitwise_and(rect, rectV)
     intersectionArea2 = np.sum(intersection2 != 0)
     result2 = (intersectionArea2 / lesionArea) * 100
     # intersect rect and rectVH
     intersection3 = cv2.bitwise_and(rect, rectVH)
     intersectionArea3 = np.sum(intersection3 != 0)
     result3 = (intersectionArea3 / lesionArea) * 100
     res = [result1, result2, result3]
     asymmetry = max(res)
     asymmetry = 100 - asymmetry
     asymmetry = round(asymmetry, 2)
     return asymmetry
 def __init__(self, version):
     self.version, self.classifier = version, CLASSIFIERS[CLASSIFIER]
     _header(self.version)
     (
         self.raw_test_samples,
         self.raw_train_samples,
         self.raw_test_names,
     ) = Preprocess._parseAllRawApks()
Example #32
	def __init__ (self):
		"""
			PUBLIC: Constructor
			-------------------
			constructs member objects
		"""
		#=====[ Step 1: create member objects	]=====
		self.preprocess = Preprocess ()
		self.storage_delegate = StorageDelegate ()
		self.semantic_analysis = SemanticAnalysis ()
		self.user_analysis = UserAnalysis ()
		self.inference = None
Example #33
class SpotOn:

	def __init__ (self):
		"""
			PUBLIC: Constructor
			-------------------
			constructs member objects
		"""
		#=====[ Step 1: create member objects	]=====
		self.preprocess = Preprocess ()
		self.storage_delegate = StorageDelegate ()
		self.semantic_analysis = SemanticAnalysis ()
		self.user_analysis = UserAnalysis ()
		self.inference = None


	def load (self):
		"""
			PUBLIC: load 
			------------
			loads in all parameters 
		"""
		#=====[ Step 1: load in semantic analysis	]=====
		print_status ("Initialization", "Loading ML parameters (Begin)")
		self.semantic_analysis.load ()
		print_status ("Initialization", "Loading ML parameters (End)")		

		#=====[ Step 2: transfer over models to inference	]=====
		print_status ("Initialization", "Constructing Inference instance (Begin)")
		self.inference = Inference (self.semantic_analysis.lda_model, self.semantic_analysis.lda_model_topics)
		print_status ("Initialization", "Constructing Inference instance (End)")







	####################################################################################################
	######################[ --- Getting Users --- ]#####################################################
	####################################################################################################

	def get_users (self):
		"""
			PUBLIC: get_users
			-----------------
			constructs self.u_df from all available 
			calendar dataframes 
		"""
		self.u_df = self.user_analysis.extract_users (self.storage_delegate.iter_calendar_dfs)
		self.u_df = self.semantic_analysis.analyze (self.u_df, 'all_event_names')


	def load_users (self, filepath='../data/pandas/users/users.df'):
		"""
			PUBLIC: load_users
			------------------
			constructs self.u_df from a saved file
		"""
		self.u_df = pd.read_pickle(filepath)

	









	####################################################################################################
	######################[ --- Training  --- ]#########################################################
	####################################################################################################

	def extract_text (self, activity_row):
		"""
			PRIVATE: extract_text
			---------------------
			given a row representing an activity, this returns 
			a list of words representing it as a 'text'
		"""
		text = []
		if type(activity_row['name']) == list:
			text += activity_row['name']
		if type(activity_row['words']) == list:
			text += activity_row['words']
		return text

	def get_corpus_dictionary (self):
		"""
			PRIVATE: get_corpus_dictionary
			------------------------------
			Assembles a gensim corpus and dictionary from activities_df,
			where each text is name || words.
		"""
		#=====[ Step 1: iterate through all activity dataframes	]=====
		print_status ("get_corpus", "assembling texts")
		texts = []
		for df in self.storage_delegate.iter_activity_dfs ():
			print_inner_status ("assembling texts", "next df")
			texts += list(df.apply(self.extract_text, axis=1))

		#=====[ Step 3: get dictionary	]=====
		print_status ("get_corpus", "assembling dictionary")
		dictionary = gensim.corpora.Dictionary(texts)

		#=====[ Step 4: get corpus	]=====
		print_status ("get_corpus", "assembling corpus")		
		corpus = [dictionary.doc2bow (text) for text in texts]

		return corpus, dictionary



	def train_semantic_analysis (self):
		"""
			PUBLIC: train_semantic_analysis
			-------------------------------
			finds parameters for self.semantic_analysis
		"""
		#=====[ Step 1: get the corpus	]=====
		print_status ("train_semantic_analysis", "getting corpus/dictionary")
		corpus, dictionary = self.get_corpus_dictionary ()

		#=====[ Step 2: train ]=====
		print_status ("train_semantic_analysis", "training semantic analysis")
		self.semantic_analysis.train (corpus, dictionary)






	####################################################################################################
	######################[ --- Inference --- ]#########################################################
	####################################################################################################

	def score_activities_old (self, user_activities, recommend_activities):
		"""
			PUBLIC: score_activities
			------------------------
			Given a user and a list of activities, both represented as json, this will return 
			(activities, scores) in a sorted list
		"""
		#=====[ Step 1: preprocess json inputs	]=====
		user_events_df = self.preprocess.preprocess_a (user_activities)
		activities_df = self.preprocess.preprocess_a (recommend_activities)

		#=====[ Step 2: construct a user from user_events_df	]=====
		def f():
			yield user_events_df
		users = self.user_analysis.extract_users (f)
		assert len(users) == 1
		user = users.iloc[0]

		#=====[ Step 3: get scores for each one	]=====
		scores = [self.inference.score_match (user, activities_df.iloc[i]) for i in range(len(activities_df))]

		#=====[ Step 4: return sorted list of activity, score	]=====
		return sorted(zip(recommend_activities, scores), key=lambda x: x[1], reverse=True)


	def score_activities (self, user_activities, recommend_activities):
		"""
			PUBLIC: score_activities
			------------------------
			Given a user and a list of activities, both represented as json, this will return 
			(activities, scores) in a sorted list
		"""
		#=====[ Step 1: preprocess user_activities and recommend_activities	]=====
		user_activities = self.preprocess.preprocess_a (user_activities)
		# print len(recommend_activities)
		recommend_activities = self.preprocess.preprocess_a (recommend_activities)
		# print len(recommend_activities)

		#=====[ Step 2: get scores for each one	]=====
		scores,act = self.inference.score_activities (user_activities, recommend_activities)
		return scores,act


	####################################################################################################
	######################[ --- Interface --- ]#########################################################
	####################################################################################################

	def print_lda_topics (self):
		"""
			PUBLIC: print_lda_topics
			------------------------
			prints out a representation of the lda topics found in self.semantic_analysis
		"""
		self.semantic_analysis.print_lda_topics ()
Example #34
        N_val = 0

    N_tst = -1

    print 'Load training set'
    X, y = load(N_trn+N_val)
    if N_trn == -1 or X.shape[0] < N_trn+N_val:
        N_trn = X.shape[0]

    print 'Load test set'
    X_tst, _ = load(N_tst, tst=True)
    N_tst = X_tst.shape[0]

    print 'Preprocess the data'
    from Preprocess import Preprocess
    prep_model = Preprocess(strategy_mv='median')
    X = prep_model.fit_transform(np.concatenate([X, X_tst]))

    # Get back the training, validation and test set
    X_trn = X[:N_trn]
    y_trn = y[:N_trn]

    X_val = X[N_trn:N_trn+N_val]
    y_val = y[N_trn:]
    X_tst = X[-N_tst:]

    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_jobs=6)

    if args.CV:
	def __init__(self,version):
		self.version,self.classifier    			   = version,CLASSIFIERS[CLASSIFIER]
		_header(self.version)
		(self.raw_test_samples,self.raw_train_samples,self.raw_test_names,) = Preprocess._parseAllRawApks()
Example #36
class SpotOn:

	def __init__ (self):
		"""
			PUBLIC: Constructor
			-------------------
			constructs member objects
		"""
		#=====[ Step 1: create member objects	]=====
		self.preprocess = Preprocess ()
		self.storage_delegate = StorageDelegate ()
		self.semantic_analysis = SemanticAnalysis ()
		self.user_analysis = UserAnalysis ()
		self.inference = None
		self.activities_corpus = None


	def load (self):
		"""
			PUBLIC: load 
			------------
			loads in all parameters 
		"""
		#=====[ Step 1: load in semantic analysis	]=====
		print_status ("Initialization", "Loading ML parameters (Begin)")
		self.semantic_analysis.load ()
		print_status ("Initialization", "Loading ML parameters (End)")		

		#=====[ Step 2: transfer over models to inference	]=====
		print_status ("Initialization", "Constructing Inference instance (Begin)")
		self.inference = Inference (self.semantic_analysis.lda_model, self.semantic_analysis.lda_model_topics)
		print_status ("Initialization", "Constructing Inference instance (End)")







	####################################################################################################
	######################[ --- Getting Users --- ]#####################################################
	####################################################################################################

	def get_users (self):
		"""
			PUBLIC: get_users
			-----------------
			constructs self.u_df from all available 
			calendar dataframes 
		"""
		self.u_df = self.user_analysis.extract_users (self.storage_delegate.iter_calendar_dfs)
		self.u_df = self.semantic_analysis.analyze (self.u_df, 'all_event_names')


	def load_users (self, filepath='../data/pandas/users/users.df'):
		"""
			PUBLIC: load_users
			------------------
			constructs self.u_df from a saved file
		"""
		self.u_df = pd.read_pickle(filepath)











	####################################################################################################
	######################[ --- Training  --- ]#########################################################
	####################################################################################################

	def extract_text (self, activity_row):
		"""
			PRIVATE: extract_text
			---------------------
			given a row representing an activity, this returns 
			a list of words representing it as a 'text'
		"""
		text = []
		if type(activity_row['name']) == list:
			text += activity_row['name']
		if type(activity_row['words']) == list:
			text += activity_row['words']
		return text


	def get_corpus_dictionary (self):
		"""
			PRIVATE: get_corpus_dictionary
			------------------------------
			Assembles a gensim corpus and dictionary from activities_df,
			where each text is name || words.
		"""
		#=====[ Step 1: iterate through all activity dataframes	]=====
		print_status ("get_corpus", "assembling texts")
		documents = []
		for df in self.storage_delegate.iter_activity_dfs ():
			df['lda_doc'] = df['name'] + df['words']
			documents += list(df['lda_doc'])

		#=====[ Step 2: get dictionary	]=====
		print_status ("get_corpus", "assembling dictionary")
		dictionary = gensim.corpora.Dictionary(documents)

		#=====[ Step 3: get corpus	]=====
		print_status ("get_corpus", "assembling corpus")		
		corpus = [dictionary.doc2bow (d) for d in documents]

		return corpus, dictionary


	def print_lda_topics (self):
		"""
			PUBLIC: print_lda_topics
			------------------------
			prints out a representation of the lda topics found in self.semantic_analysis
		"""
		print_header ("LDA TOPICS: ")
		self.semantic_analysis.print_lda_topics ()


	def train_semantic_analysis (self):
		"""
			PUBLIC: train_semantic_analysis
			-------------------------------
			finds parameters for self.semantic_analysis
		"""
		#=====[ Step 1: get the corpus	]=====
		print_status ("train_semantic_analysis", "getting corpus/dictionary")
		corpus, dictionary = self.get_corpus_dictionary ()

		#=====[ Step 2: train ]=====
		print_status ("train_semantic_analysis", "training semantic analysis")
		self.semantic_analysis.train (corpus, dictionary)

		#####[ DEBUG: print out lda topics	]#####
		self.print_lda_topics ()




	####################################################################################################
	######################[ --- Processing --- ]########################################################
	####################################################################################################

	def activities_json_to_df (self, a_json):
		"""
			PRIVATE: activities_json_to_df
			------------------------------
			given: list of json dicts representing activities 
			returns: dataframe with preprocessing, semantic analysis
		"""
		a_df = self.preprocess.preprocess_a (a_json)
		a_df = self.semantic_analysis.add_lda_vec_column (a_df)
		a_df = self.semantic_analysis.add_w2v_sum_column (a_df)
		return a_df


	def calendar_events_json_to_df (self, ce_json):
		"""
			PRIVATE: calendar_events_json_to_df
			------------------------------
			given: list of json dicts representing calendar events 
			returns: dataframe with preprocessing, semantic analysis
		"""
		ce_df = self.preprocess.preprocess_ce (ce_json)
		ce_df = self.semantic_analysis.add_lda_vec_column (ce_df)
		ce_df = self.semantic_analysis.add_w2v_sum_column (ce_df)
		return ce_df


	def calendar_events_to_user_representation(self, ce_json):
		"""
			PUBLIC: calendar_events_to_user_representation
			----------------------------------------------
			given a list containing json dicts representing calendar events belonging
			to a single user, this will return a representation that can be passed to 
			score_activity_for_user and recommend_for_user
		"""
		user_df 	= self.calendar_events_json_to_df (ce_json)
		lda_vec 	= self.semantic_analysis.get_user_lda_vec (user_df)
		return {'events_df':user_df, 'lda_vec':lda_vec}


	def load_activities_corpus(self, activities):
		'''
			function: load_activities_corpus
			params: activities - list of activities to recommend

			returns: none
			notes: use this function to load a big activities corpus into the SpotOn object, and later when calling
			recommend_for_user we will pull activities to recommend from this corpus.

			Can be called multiple times to update to different activities
		'''
		self.activities_corpus = self.activities_json_to_df (activities)












	####################################################################################################
	######################[ --- Recommending --- ]######################################################
	####################################################################################################

	def score_activity_for_user(self, user_representation, activity):
		"""
			PUBLIC: score_activity_for_user
			-------------------------------
			params: user_representation - representation of the user to score for
								(created by calendar_events_to_user_representation)
					activity - json of the activity to score

			notes: goes from the representation of the user that you use + one activity 
					-> return a score for how much they'd like it
		"""
		#=====[ Step 1: get activity dataframe 	]=====
		activity_df = self.activities_json_to_df ([activity])

		#=====[ Step 2: get scored dataframe	]=====
		activity_df = self.inference.infer_scores (user_representation, activity_df)

		#=====[ Step 3: extract and return score	]=====
		return activity_df.iloc[0]['score']


	def recommend_for_user(self, user_representation, activities=None, topn=10):
		"""
			PUBLIC: recommend_for_user
			--------------------------
			params: user_representation - representation of the user to recommend for
					activities - either a list of json activities, or None if 
									.load_activities_corpus has been called
					topn - number of recommendations to return
		"""
		#=====[ Step 1: get a_df, df of activities to recommend	]=====
		if activities is not None:
			activities_df = self.activities_json_to_df (activities)
		else:
			if self.activities_corpus is None:
				self.load_activities_corpus ()
			activities_df = self.activities_corpus

		#=====[ Step 2: get scores, return sorted	]=====
		activity_ranks = self.inference.rank_activities (user_representation, activities_df)
		return list(activity_ranks)


	def recommend_users_for_activity(self, activity, list_of_users, topn=10):
		"""
			PUBLIC: recommend_users_for_activities
			--------------------------------------
			params: activity - activity to recommend users for
					list_of_users - list of users to filter
					topn - number of users to return

			notes: goes from an activity and a list of users -> topn users for that activity
		"""
		scores = [self.score_activity_for_user(user, activity) for user in list_of_users]
		sorted_ix = np.argsort(scores)[::-1]
		return [list_of_users[sorted_ix[i]] for i in range(topn)]
Example #37
	def getDayScore(self, wordCount):
		positiveScore = 0
		negativeScore = 0
		for word, count in wordCount.iteritems():
			# print word.encode(ignore), count
			wordDict = self.wl.getWordDict(word)

			if wordDict != None:
				if wordDict['polarity'] == POLARITY['positive']:
					positiveScore += count
				elif wordDict['polarity'] == POLARITY['negative']:
					negativeScore += count

		if negativeScore == 0:
			return float(positiveScore)

		return float(positiveScore) / negativeScore

	# Return the score timeseries as a list
	def getScoreTimeseries(self, wordCounterTs):
		return wordCounterTs.mapValues(self.getDayScore)


if __name__ == '__main__':
	ts = Preprocess.loadTs()

	sent = SentimentAnalysis()
	t = time.clock()
	a = sent.getScoreTimeseries(TimeSeries(ts.getTopicTs('msft')))
	print time.clock() - t
Example #38
# Script: preprocess_json
# -----------------------
# script to convert raw json of activities to 
# a large pandas dataframe. call as 'ipython -i preprocess_json.py'
# if you want to pickle or play with the resulting
# dataframe afterwards
import pickle
import json
import time
from Preprocess import Preprocess
from util import *

if __name__ == "__main__":
	print_header ("PREPROCESS JSON - convert raw json to dataframe")
	start_time = time.time ()

	#=====[ Step 1: create preprocess	]=====
	pre = Preprocess ()

	#=====[ Step 2: load json into memory	]=====
	print_status ("preprocess_json", "loading json into memory")
	json_filename = '/Users/jayhack/Dropbox/Data/Spoton/activities_0_till_168694.json'
	a_json = json.load (open(json_filename, 'r'))

	#=====[ Step 3: apply Preprocess to it	]=====
	print_header ('PREPROCESSING JSON')
	a_df = pre.preprocess_a (a_json)

	######[ PRINT ELAPSED TIME	]#####
	end_time = time.time ()
	print_notification ('Elapsed time: ' + str(end_time - start_time))
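The script imports pickle but the excerpt never persists the resulting dataframe; a closing step consistent with the header comment might be (the output filename is hypothetical):

	#=====[ Step 4 (hypothetical): persist the dataframe	]=====
	a_df.to_pickle('activities_df.pkl')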