def form():
    file_path = ""
    # load the model
    cnn = keras.models.load_model('dr.h5')
    try:
        # get the image
        if request.method == "POST":
            image = request.files['image']
            img_name = image.filename
            file_path = os.path.join('./static/uploaded_images', img_name)
            image.save(file_path)
            # preprocess the image to make it similar to training data
            a = Preprocess()
            a.preprocess(file_path)
            # image is converted to grayscale and then to a numpy array
            image = Image.open('./static/uploaded_images/preprocessed.jpeg')
            image = ImageOps.grayscale(image)
            img_arr = img.img_to_array(image)
            img_arr = img_arr.astype("float32")
            img_arr = img_arr / 255.0
            img_arr = np.expand_dims(img_arr, axis=0)
            # prediction
            predict = cnn.predict(img_arr)
            pred = np.argmax(predict[0])
            os.remove(file_path)
            os.remove('./static/uploaded_images/preprocessed.jpeg')
            return render_template("index.html", image_name=pred)
        else:
            return render_template("index.html", image_name="None")
    except Exception:
        return render_template("index.html", image_name="No proper image file selected")
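A minimal wiring sketch for the view above, assuming a Flask app; the route, the module imports, and the `from Preprocess import Preprocess` line are assumptions inferred from the names the snippet uses, not part of the original.

# Hypothetical app setup for the form() view above; names and route are assumptions.
import os
import numpy as np
from flask import Flask, request, render_template
from PIL import Image, ImageOps
from tensorflow import keras
from tensorflow.keras.preprocessing import image as img  # provides img.img_to_array
from Preprocess import Preprocess  # the author's preprocessing module

app = Flask(__name__)

# Register the view for GET (show the form) and POST (run the prediction).
app.add_url_rule("/", view_func=form, methods=["GET", "POST"])

if __name__ == "__main__":
    app.run(debug=True)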
def __init__(self, p, q, lag):
    self.preprocess = Preprocess(lag=lag)
    self.p = p  # order of residual term
    self.q = q  # order of variance term
    self.omega = np.array([])
    self.alpha = np.empty(shape=(0, self.p))  # residual term parameter
    self.beta = np.empty(shape=(0, self.q))  # variance term parameter
def classify(self, img_mdf):
    """
    Execute the neural network model and return whether the image is an ostrich.

    Args:
        img_mdf (numpy array): array of image data.

    Returns:
        True if the most likely prediction is "ostrich", otherwise False.
    """
    if not os.path.exists(os.path.join(h5_path, "ostrich.h5")):
        print("Invalid path")
        return
    model = keras.models.load_model(os.path.join(h5_path, "ostrich.h5"))
    preprocess = Preprocess()
    img_mdf = preprocess.prepare_image(img_mdf, 60)
    predictions = model.predict(img_mdf)
    max_index = np.argmax(predictions)  # avoid shadowing the built-in max()
    categories = ["ostrich", "not an ostrich"]
    result = categories[max_index]
    return result == "ostrich"
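A hedged usage sketch for the classifier above; the wrapper class name `OstrichClassifier`, the image path, and the use of cv2.imread are assumptions.

# Hypothetical usage (class name and image path are assumptions, not from the original).
import cv2

classifier = OstrichClassifier()
frame = cv2.imread("sample_bird.jpg")  # numpy array, as classify() expects
print("ostrich" if classifier.classify(frame) else "not an ostrich")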
def preprocessData():
    preprocessing = Preprocess(data="fundamental_ratios")
    print("retrieving fundamental ratios...")
    fr_train, fr_validate = preprocessing.get_data(dataType="scaled", dset="train_validate")
    print("retrieving returns...")
    ar = preprocessing.retrieve_return()
    print("split returns...")
    ar_train = ar[ar.index.isin(fr_train.index)]
    ar_validate = ar[ar.index.isin(fr_validate.index)]
    print("trim fundamental ratios...")
    fr_train = fr_train[fr_train.index.isin(ar_train.index)]
    fr_validate = fr_validate[fr_validate.index.isin(ar_validate.index)]
    # remove boundary values
    print(ar_train)
    ar_train.drop(ar_train.nlargest(250, "return").index, axis=0, inplace=True)
    ar_train.drop(ar_train.nsmallest(250, "return").index, axis=0, inplace=True)
    # re-order train set for visualization
    ar_train = ar_train.sort_values("return")
    fr_train = fr_train.loc[ar_train.index]
    train = (fr_train, ar_train)
    # re-order validation set for visualization
    ar_validate = ar_validate.sort_values("return")
    fr_validate = fr_validate.loc[ar_validate.index]
    validate = (fr_validate, ar_validate)
    return train, validate
def test():
    import pandas as pd
    import numpy as np
    from Preprocess import Preprocess

    # note: np.str was removed from NumPy; plain str is the equivalent dtype
    train = pd.read_csv('~/Downloads/ds-project-train.csv',
                        dtype={'SHIPPER.ADDRESS': str, 'ZIPCODE': str},
                        parse_dates=['ARRIVAL.DATE'])
    test = pd.read_csv('~/Downloads/ds-project-test.csv',
                       dtype={'SHIPPER.ADDRESS': str, 'ZIPCODE': str},
                       parse_dates=['ARRIVAL.DATE'])
    p = Preprocess()
    X_train = p.run(df=train)
    X_test = p.run(df=test, test=True)
    y_train = X_train['COUNTRY.OF.ORIGIN']
    X_train = X_train.drop(['COUNTRY.OF.ORIGIN'], axis=1)
    y_test = X_test['COUNTRY.OF.ORIGIN']
    X_test = X_test.drop(['COUNTRY.OF.ORIGIN'], axis=1)
    fe = FeatureEngineering()
    X_train = fe.run(df=X_train)
    X_test = fe.run(df=X_test, test=True)
    print('!')
def classify(self, img_test):
    """
    Execute the neural network model and return True if the image is a guinea pig.

    Args:
        img_test (numpy array): array of image data.

    Returns:
        True if the most likely prediction is "guinea pig", otherwise False.
    """
    save_path = os.path.join(h5_path, "guineapig.h5")
    preprocess = Preprocess()
    img = preprocess.prepare_image(img_test, 60)
    if not os.path.exists(save_path):
        raise FileNotFoundError("File guineapig.h5 was deleted. :(")
    model = keras.models.load_model(save_path)
    predictions = model.predict(img)
    max_index = np.argmax(predictions)  # avoid shadowing the built-in max()
    categories = ["guinea pig", "not guinea pig"]
    result = categories[max_index]
    return result == "guinea pig"
class InformationRatio:

    def __init__(self, benchmark="snp500"):
        self.benchmark = benchmark
        self.preprocess = Preprocess()
        self.alpha = None
        self.index = None

    def computeInformationRatio(self, portfolio):
        returns = self.preprocess.retrieve_return()
        index = self.preprocess.retrieve_benchmark_change("snp500")
        if self.alpha is None:
            alpha = post.compute_alpha(index, returns).loc[portfolio.keys()]["alpha"].values
            self.alpha = alpha
        else:
            alpha = self.alpha
        # print("alpha:", alpha)
        weight = np.array(list(portfolio.values()))
        # print("weight", weight)
        portfolio_return = np.sum(np.multiply(alpha, weight))
        # print("portfolio return", portfolio_return)
        volatility = np.std(alpha)
        # print("volatility", volatility)
        if self.index is None:
            index = self.preprocess.retrieve_benchmark_change(self.benchmark) - 1
            self.index = index
        else:
            index = self.index
        # print("benchmark", index)
        information_ratio = (portfolio_return - index) / volatility
        return information_ratio
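A hedged usage sketch for InformationRatio; the tickers and weights are purely illustrative and assume computeInformationRatio takes a {symbol: weight} mapping, as the code above suggests, with those symbols present in the underlying return data.

# Illustrative only: weights sum to 1; symbols must exist in the retrieved return data.
ir_calc = InformationRatio(benchmark="snp500")
portfolio = {"GE": 0.4, "MMM": 0.35, "AAPL": 0.25}
print("information ratio:", ir_calc.computeInformationRatio(portfolio))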
def test_retrieve_mkt_caps(self):
    self.preprocess = Preprocess(lag=7)
    try:
        df = self.preprocess.retrieve_mkt_caps(["GE", "MMM", "APPL"])
        # fail if the result is not a DataFrame or is empty
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_dividends(self):
    self.preprocess = Preprocess(lag=7)
    try:
        df = self.preprocess.retrieve_dividends()
        # fail if the result is not a DataFrame or is empty
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_fundamental_ratios(self):
    self.preprocess = Preprocess()
    try:
        df = self.preprocess.retrieve_fundamental_ratios()
        # fail if the result is not a DataFrame or is empty
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_benchmark(self):
    self.preprocess = Preprocess(lag=30)
    try:
        df = self.preprocess.retrieve_benchmark("snp500")
        # fail if the result is not a DataFrame or is empty
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise Exception
    except Exception:
        self.fail()
def test_retrieve_benchmark_change(self):
    self.preprocess = Preprocess(lag=7)
    try:
        change = self.preprocess.retrieve_benchmark_change("snp500")
        if not isinstance(change, float):
            raise Exception
    except Exception:
        self.fail()
def __init__(self, asset, risk_free=0):
    self.preprocess = Preprocess()
    self.asset = asset
    self.risk_free = risk_free
    self.covariance = None  # type: pd.DataFrame
    self.mean = None  # pd.Series(index=asset)
    self.max_sharpe_comp = None  # maximum sharpe portfolio composition
    self.min_vol_comp = None  # minimum volatility portfolio composition
class OptionPair:
    """constructor"""

    def __init__(self):
        self.preprocess = Preprocess(lag=70)

    """compute_correlation
    Description: compute correlation between all stocks
    """

    def compute_correlation(self):
        daily_price = self.preprocess.retrieve_open_close()
        daily_change = post.compute_daily_change(daily_price)
        return daily_change.corr(method='pearson', min_periods=30)

    """find_movement_pairs
    Description: find stock pairs with high daily movement correlation
    Input:
        corr: correlation matrix of all stocks
        threshold: correlation coefficient threshold
    """

    @staticmethod
    def find_movement_pairs(corr, threshold=0.95):
        pairs = []
        for symbol in corr:
            for (i, v) in corr[symbol].items():
                if i == symbol:
                    continue
                elif abs(v) > threshold:
                    pairs.append((symbol, i, v))
        return pairs

    """narrow_growth_pairs
    Description: Even a highly correlated daily-movement pair will show long-term
        growth drift, so this method narrows the correlation pairs to those that
        have similar growth over time.
    Input:
        pairs: best daily movement correlation pairs
        threshold: growth drift threshold
    """

    def narrow_growth_pairs(self, pairs, threshold=0.05):
        returns = self.preprocess.retrieve_return()
        # iterate over a copy so removing pairs does not skip elements
        for pair in list(pairs):
            try:
                r1 = returns.loc[pair[0], "return"]
                r2 = returns.loc[pair[1], "return"]
                drift = abs(r1 - r2) / abs((r1 + r2) / 2)
            except KeyError:
                # remove pair if return cannot be validated
                drift = 1
            if drift > threshold:
                pairs.remove(pair)
        return pairs
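A hedged sketch of chaining the three OptionPair steps above; the thresholds mirror the defaults and the final print is illustrative.

# Hypothetical usage: correlation matrix -> highly correlated pairs -> low-drift pairs.
op = OptionPair()
corr = op.compute_correlation()
pairs = OptionPair.find_movement_pairs(corr, threshold=0.95)
pairs = op.narrow_growth_pairs(pairs, threshold=0.05)
print(pairs[:10])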
def test_scale_data(self):
    self.preprocess = Preprocess(lag=7)
    data = [('x', [1, 2, 3, 4]), ('y', [51, -6, 43, -8])]
    # DataFrame.from_items was removed from pandas; build from a dict instead
    df = pd.DataFrame(dict(data))
    scaled = self.preprocess.scale_data(df)
    self.assertTrue(scaled['x'].max() <= 1)
    self.assertTrue(scaled['y'].max() <= 1)
    self.assertTrue(scaled['x'].min() >= 0)
    self.assertTrue(scaled['y'].min() >= 0)
def test_retrieve_return(self):
    self.preprocess = Preprocess(lag=7)
    try:
        # non-split returns are a superset of split returns
        df1 = self.preprocess.retrieve_return()
        # fail if the result is not a DataFrame or is empty
        if not isinstance(df1, pd.DataFrame) or df1.empty:
            raise Exception
    except Exception:
        self.fail()
def __init__(self, topics_filename, dir_name):
    self.parser = TopicsParser()
    self.preProcess = Preprocess()
    self.preprocessed = True
    self.topics_parsed = self.parser.get_data(topics_filename)
    self.topics = dict()
    for topic in self.topics_parsed:
        self.topics[topic['num']] = " ".join(
            self.preProcess.preprocess(topic['title'] + ' ' + topic['narr'] + ' ' + topic['desc']))
def test_filter_column(self):
    self.preprocess = Preprocess(density=0.5, lag=7)
    data = [('symbol', ['A', 'B', 'C', 'D']),
            ('index', [150, 200, 50, 10]),
            ('date', [200, 210, 90, 20]),
            ('currency', [140, 215, 95, 30]),
            ('latestadate', [140, 215, 95, 40]),
            ('dense', [140, 215, np.nan, 50]),
            ('sparse', [np.nan, np.nan, np.nan, 60])]
    # DataFrame.from_items was removed from pandas; build from a dict instead
    df = pd.DataFrame(dict(data))
    filtered = self.preprocess.filter_column(df)
    # only 'symbol' and 'dense' should survive the density filter
    self.assertEqual(len(filtered.columns), 2)
def __init__(self, load, dir_name, files):
    self.preProcess = Preprocess()
    self.documentParser = DocumentParser()
    self.preprocessed = True
    if not load:
        if not os.path.isdir(dir_name):
            os.mkdir(dir_name)
        schema = Schema(id=TEXT(stored=True), content=TEXT(stored=True))
        self.ix = create_in(dir_name, schema)
        self.index(files)
    else:
        self.ix = open_dir(dir_name)
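A hedged sketch of querying an index built with the schema above via Whoosh's standard search API; the instance name `indexer` and the query text are assumptions.

# Illustrative query against the index created above (instance name and query are assumptions).
from whoosh.qparser import QueryParser

with indexer.ix.searcher() as searcher:
    query = QueryParser("content", indexer.ix.schema).parse("preprocessing")
    for hit in searcher.search(query, limit=10):
        print(hit["id"], hit.score)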
def __init__(self, df=None, continuous_features=[], unordered_categorical_features=[],
             ordered_categorical_features=[]):
    self.data = df
    self.continuous_features = continuous_features
    self.unordered_categorical_features = unordered_categorical_features
    self.ordered_categorical_features = ordered_categorical_features
    self.prprcs = Preprocess()
    self.fs = FieldStatistics()
    self.trnsfrmr = Transformer()
    self.imptr = ImputeData()
    self.pltr = Plotting()
def main():
    '''
    Main function to preprocess data. This function uses the Preprocess class.
    '''
    dataDirectory = './Data/ToProcessData'
    preprocessor = Preprocess(dataDirectory)
    print('Object created')
    st = time.time()
    preprocessor.PreprocessData()
    # preprocessor.PrepareTrainTestSet()
    print('Preprocess data execution ended')
    en = time.time()
    print('Time taken to process data = ', en - st, ' sec')
def contours2(img):
    # equalize Y channel hist
    # img = Preprocess.equalizeHistYChannel(img)
    # remove artifacts
    # img = Preprocess.removeArtifact(img)
    # remove RGB artifact
    img = Preprocess.removeArtifactYUV(img)
    # apply OTSU threshold
    ret, thresh = Preprocess.OTSUThreshold(img)
    # search for contours and select the biggest one
    # (3-value unpacking matches OpenCV 3.x; OpenCV 4.x returns only contours and hierarchy)
    c, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    cnt = max(contours, key=cv2.contourArea)
    return cnt
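A hedged usage sketch for contours2; the input path, output path, and drawing parameters are assumptions.

# Hypothetical usage: draw the largest contour found by contours2 on a copy of the image.
image = cv2.imread("lesion.jpg")
largest = contours2(image.copy())
cv2.drawContours(image, [largest], -1, (0, 255, 0), 2)
cv2.imwrite("lesion_contour.jpg", image)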
def __init__(self):
    '''
    Initialization; runs by default when a new object is created.
    '''
    # ## Windows
    # main window
    self.mainWindow = MainWindow()
    # information window
    self.informationMessageApp = QtWidgets.QWidget()
    self.informationMessageWindow = MessageWindow()
    # service objects
    self.preprocess = Preprocess()
    self.process = Process()
    self.postprocess = Postprocess()
    self.attachmentMatch = AttachmentMatch()
    self.jsonService = JsonService()
    self.settingJsonService = JsonService('settings.json')
    # ## Connect slots and signals
    # express process / start: pressed --> express processing
    self.mainWindow.expressProcessButton.pressed.connect(
        self.expressProcess)
    # general process / start: pressed --> general processing
    self.mainWindow.generalProcessButton.pressed.connect(
        self.generalProcess)
    # general process / attachment-match checkbox: state changed --> enable or disable attachment matching
    self.mainWindow.shouldMatchAttachmentCheckBox.stateChanged.connect(
        self.shouldEnableAttachmentMatch)
    # general process / browse original data: pressed --> choose the original data file
    self.mainWindow.generalProcessOriginalDataExploreButton.pressed.connect(
        self.exploreOriginalDataFile)
    # general process / browse attachment directory: pressed --> choose the directory containing attachments
    self.mainWindow.generalProcessAttachmentLocationExploreButton.pressed.connect(
        self.exploreAttachmentDirectory)
    # general process / browse export data: pressed --> choose the export data file
    self.mainWindow.generalProcessExportFileExploreButton.pressed.connect(
        self.exploreExportDataFile)
def __init__(self, name, topics_filename, relevance_filename, preprocessed):
    super().__init__(name)
    self.topicIndex = TopicsIndex('Topic Index', self, topics_filename, relevance_filename, preprocessed)
    self.documentParser = DocumentParser()
    self.docLength = 0
    self.dictionary = {}
    self.size = 0
    self.preProcess = Preprocess()
    self.evalDocs = self.calcEvaluatedDocs()
    self.preprocessed = preprocessed
    self.processingTime = 0
    self.indxingTime = 0
    self.processingMemory = 0
    self.indexingMemory = 0
    self.document_lengths = dict()
def colorSDG(img, contour):
    '''
    calculate the Standard Deviation Grayscale
    '''
    # remove artifact
    img = Preprocess.removeArtifactYUV(img)
    # extract the lesion
    lesion = Caracteristics.extractLesion(img, contour)
    # convert img to gray
    lesion = cv2.cvtColor(lesion, cv2.COLOR_RGB2GRAY)
    # get bounding rect
    x, y, w, h = cv2.boundingRect(contour)
    # crop the rect
    lesion = lesion[y:y + h, x:x + w]
    # lesion area
    lesionArea = cv2.contourArea(contour)
    # sum of pixels
    s = np.sum(lesion)
    # get mean color value
    mean = s // lesionArea
    # calculate SDG
    lesion[lesion != 0] = np.subtract(lesion[lesion != 0], mean)
    lesion = np.power(lesion, 2)
    SDG = np.sum(lesion)
    # vectorized equivalent of:
    # SDG = 0
    # for i in range(0, h):
    #     for j in range(0, w):
    #         if lesion[i, j] != 0:
    #             SDG = SDG + ((lesion[i, j] - mean) ** 2)
    SDG = np.sqrt((1 / lesionArea) * SDG)
    SDG = round(SDG, 2)
    return SDG
def asymmetryIndex(img, contour):
    '''
    get asymmetry index by searching for the homologue of each pixel
    '''
    # remove artifact
    img = Preprocess.removeArtifactYUV(img)
    # convert img to gray
    imgray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # get bounding rect
    x, y, w, h = cv2.boundingRect(contour)
    # crop the rect
    rect = imgray[y:y + h, x:x + w]
    # rotate 180°
    rotated = imutils.rotate_bound(rect, 180)
    # intersection between rect and rotated (search)
    intersection = cv2.bitwise_and(rect, rotated)
    imgray[y:y + h, x:x + w] = intersection
    # get area of intersection (black means no homologue found)
    intersectionArea = np.sum(intersection != 0)
    noHomologueArea = np.sum(intersection == 0)
    # lesion area
    lesionArea = cv2.contourArea(contour)
    # asymmetry
    asymmetry = (noHomologueArea / lesionArea) * 100
    asymmetry = round(asymmetry, 2)
    return asymmetry
def inflammationAndBloodness(img, contour):
    '''
    returns the inflammation and bloodness factor (presence of red colors)
    '''
    # remove artifact
    img = Preprocess.removeArtifactYUV(img)
    # extract the lesion
    lesion = Caracteristics.extractLesion(img, contour)
    # lesion area
    lesionArea = cv2.contourArea(contour)
    # get bounding rect
    x, y, w, h = cv2.boundingRect(contour)
    # crop the rect
    lesion = lesion[y:y + h, x:x + w]
    # convert to HSV
    lesionHSV = cv2.cvtColor(lesion, cv2.COLOR_BGR2HSV)
    # set color intervals
    redH1 = np.array([0, 100, 220], dtype=np.uint8)
    redL1 = np.array([10, 125, 253], dtype=np.uint8)
    redH2 = np.array([160, 130, 100], dtype=np.uint8)
    redL2 = np.array([180, 255, 253], dtype=np.uint8)
    intervalsL = [redH1, redH2]
    intervalsH = [redL1, redL2]
    # check colors
    nbColors = 0
    # seuil: threshold (percentage of the color area compared with the total lesion area)
    seuil = 1
    for i in range(0, len(intervalsH)):
        L = intervalsL[i]
        H = intervalsH[i]
        mask = cv2.inRange(lesionHSV, L, H)
        n = np.sum(mask != 0) / lesionArea * 100
        if n > seuil:
            nbColors += 1
    return nbColors
def loadScoreTs(stock, wordCounterTsFilename=TS_FILENAME):
    if wordCounterTsFilename not in wordCounters:
        wordCounters[wordCounterTsFilename] = Preprocess.loadTs(wordCounterTsFilename)
    wordCounterTs = wordCounters[wordCounterTsFilename]
    topicWordCountTs = wordCounterTs.getTopicTs(stock)
    return sent.getScoreTimeseries(topicWordCountTs)
def colorHSVIntervals(img, contour):
    '''
    returns the number of colors in a lesion by assigning colors to HSV intervals
    '''
    # remove artifact
    img = Preprocess.removeArtifactYUV(img)
    # extract the lesion
    lesion = Caracteristics.extractLesion(img, contour)
    # lesion area
    lesionArea = cv2.contourArea(contour)
    # get bounding rect
    x, y, w, h = cv2.boundingRect(contour)
    # crop the rect
    lesion = lesion[y:y + h, x:x + w]
    # convert to HSV
    lesionHSV = cv2.cvtColor(lesion, cv2.COLOR_BGR2HSV)
    # set color intervals
    whiteH = np.array([0, 0, 254], dtype=np.uint8)
    whiteL = np.array([180, 250, 255], dtype=np.uint8)
    blackH = np.array([0, 0, 1], dtype=np.uint8)
    blackL = np.array([180, 120, 150], dtype=np.uint8)
    redH1 = np.array([0, 100, 220], dtype=np.uint8)
    redL1 = np.array([10, 125, 253], dtype=np.uint8)
    redH2 = np.array([160, 130, 100], dtype=np.uint8)
    redL2 = np.array([180, 255, 253], dtype=np.uint8)
    darkBrownH1 = np.array([0, 30, 140], dtype=np.uint8)
    darkBrownL1 = np.array([10, 120, 253], dtype=np.uint8)
    darkBrownH2 = np.array([0, 130, 120], dtype=np.uint8)
    darkBrownL2 = np.array([10, 255, 253], dtype=np.uint8)
    lightBrownH1 = np.array([11, 50, 140], dtype=np.uint8)
    lightBrownL1 = np.array([21, 255, 253], dtype=np.uint8)
    lightBrownH2 = np.array([160, 30, 170], dtype=np.uint8)
    lightBrownL2 = np.array([180, 100, 253], dtype=np.uint8)
    lightBrownH3 = np.array([11, 120, 100], dtype=np.uint8)
    lightBrownL3 = np.array([21, 255, 253], dtype=np.uint8)
    blueGrayH1 = np.array([120, 10, 150], dtype=np.uint8)
    blueGrayL1 = np.array([180, 120, 170], dtype=np.uint8)
    blueGrayH2 = np.array([0, 120, 100], dtype=np.uint8)
    blueGrayL2 = np.array([10, 130, 190], dtype=np.uint8)
    intervalsL = [whiteH, blackH, redH1, redH2, darkBrownH1, darkBrownH2,
                  lightBrownH1, lightBrownH2, lightBrownH3, blueGrayH1, blueGrayH2]
    intervalsH = [whiteL, blackL, redL1, redL2, darkBrownL1, darkBrownL2,
                  lightBrownL1, lightBrownL2, lightBrownL3, blueGrayL1, blueGrayL2]
    # check colors
    nbColors = 0
    # seuil: threshold (percentage of the color area compared with the total lesion area)
    seuil = 6
    for i in range(0, len(intervalsH) - 1):
        L = intervalsL[i]
        H = intervalsH[i]
        mask = cv2.inRange(lesionHSV, L, H)
        n = np.sum(mask != 0) / lesionArea * 100
        if n > seuil:
            nbColors += 1
    return nbColors
def asymmetryBySubRegionCentered(img, contour):
    '''
    get asymmetry by dividing the lesion into 4 subregions,
    with the lesion placed at the center of the image
    '''
    # remove artifact
    img = Preprocess.removeArtifactYUV(img)
    # extract the lesion and convert to gray
    img = Caracteristics.extractLesion(img, contour)
    imgray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # binarize the img
    # imgray[imgray > 0] = 255
    # find best fit ellipse
    (_, _), (_, _), angle = cv2.fitEllipse(contour)
    # get bounding rect
    x, y, w, h = cv2.boundingRect(contour)
    # get moments of contour
    M = cv2.moments(contour)
    # center of gravity of the lesion
    xe = int(M["m10"] / M["m00"])
    ye = int(M["m01"] / M["m00"])
    # get the centered rect
    cx = x + w // 2
    cy = y + h // 2
    deltaX1 = abs(int(xe - cx))
    deltaY1 = abs(int(ye - cy))
    x1 = x + deltaX1
    w1 = w + deltaX1
    y1 = y + deltaY1
    h1 = h + deltaY1
    padding = 0
    # crop the rect
    rect = imgray[y1 - padding:y1 + h1 + padding, x1 - padding:x1 + w1 + padding]
    # rotate the lesion according to its best fit ellipse
    rect = ndimage.rotate(rect, angle, reshape=False)
    # flip H, flip V, flip VH
    rectH = cv2.flip(rect, 0)
    rectV = cv2.flip(rect, 1)
    rectVH = cv2.flip(rect, -1)
    # lesion area
    lesionArea = cv2.contourArea(contour)
    # intersect rect and rectH
    intersection1 = cv2.bitwise_and(rect, rectH)
    intersectionArea1 = np.sum(intersection1 != 0)
    result1 = (intersectionArea1 / lesionArea) * 100
    # intersect rect and rectV
    intersection2 = cv2.bitwise_and(rect, rectV)
    intersectionArea2 = np.sum(intersection2 != 0)
    result2 = (intersectionArea2 / lesionArea) * 100
    # intersect rect and rectVH
    intersection3 = cv2.bitwise_and(rect, rectVH)
    intersectionArea3 = np.sum(intersection3 != 0)
    result3 = (intersectionArea3 / lesionArea) * 100
    res = [result1, result2, result3]
    asymmetry = max(res)
    asymmetry = 100 - asymmetry
    asymmetry = round(asymmetry, 2)
    return asymmetry
def __init__(self, version):
    self.version, self.classifier = version, CLASSIFIERS[CLASSIFIER]
    _header(self.version)
    (
        self.raw_test_samples,
        self.raw_train_samples,
        self.raw_test_names,
    ) = Preprocess._parseAllRawApks()
def __init__(self):
    """
    PUBLIC: Constructor
    -------------------
    constructs member objects
    """
    #=====[ Step 1: create member objects ]=====
    self.preprocess = Preprocess()
    self.storage_delegate = StorageDelegate()
    self.semantic_analysis = SemanticAnalysis()
    self.user_analysis = UserAnalysis()
    self.inference = None
class SpotOn:

    def __init__(self):
        """
        PUBLIC: Constructor
        -------------------
        constructs member objects
        """
        #=====[ Step 1: create member objects ]=====
        self.preprocess = Preprocess()
        self.storage_delegate = StorageDelegate()
        self.semantic_analysis = SemanticAnalysis()
        self.user_analysis = UserAnalysis()
        self.inference = None

    def load(self):
        """
        PUBLIC: load
        ------------
        loads in all parameters
        """
        #=====[ Step 1: load in semantic analysis ]=====
        print_status("Initialization", "Loading ML parameters (Begin)")
        self.semantic_analysis.load()
        print_status("Initialization", "Loading ML parameters (End)")

        #=====[ Step 2: transfer over models to inference ]=====
        print_status("Initialization", "Constructing Inference instance (Begin)")
        self.inference = Inference(self.semantic_analysis.lda_model, self.semantic_analysis.lda_model_topics)
        print_status("Initialization", "Constructing Inference instance (End)")

    ####################################################################################################
    ######################[ --- Getting Users --- ]#####################################################
    ####################################################################################################

    def get_users(self):
        """
        PUBLIC: get_users
        -----------------
        constructs self.u_df from all available calendar dataframes
        """
        self.u_df = self.user_analysis.extract_users(self.storage_delegate.iter_calendar_dfs)
        self.u_df = self.semantic_analysis.analyze(self.u_df, 'all_event_names')

    def load_users(self, filepath='../data/pandas/users/users.df'):
        """
        PUBLIC: load_users
        ------------------
        constructs self.u_df from a saved file
        """
        self.u_df = pd.read_pickle(filepath)

    ####################################################################################################
    ######################[ --- Training --- ]##########################################################
    ####################################################################################################

    def extract_text(self, activity_row):
        """
        PRIVATE: extract_text
        ---------------------
        given a row representing an activity, this returns a list of
        words representing it as a 'text'
        """
        text = []
        if type(activity_row['name']) == list:
            text += activity_row['name']
        if type(activity_row['words']) == list:
            text += activity_row['words']
        return text

    def get_corpus_dictionary(self):
        """
        PRIVATE: get_corpus_dictionary
        ------------------------------
        Assembles a gensim corpus and dictionary from activities_df,
        where each text is name || words.
        """
        #=====[ Step 1: iterate through all activity dataframes ]=====
        print_status("get_corpus", "assembling texts")
        texts = []
        for df in self.storage_delegate.iter_activity_dfs():
            print_inner_status("assembling texts", "next df")
            texts += list(df.apply(self.extract_text, axis=1))

        #=====[ Step 2: get dictionary ]=====
        print_status("get_corpus", "assembling dictionary")
        dictionary = gensim.corpora.Dictionary(texts)

        #=====[ Step 3: get corpus ]=====
        print_status("get_corpus", "assembling corpus")
        corpus = [dictionary.doc2bow(text) for text in texts]

        return corpus, dictionary

    def train_semantic_analysis(self):
        """
        PUBLIC: train_semantic_analysis
        -------------------------------
        finds parameters for self.semantic_analysis
        """
        #=====[ Step 1: get the corpus ]=====
        print_status("train_semantic_analysis", "getting corpus/dictionary")
        corpus, dictionary = self.get_corpus_dictionary()

        #=====[ Step 2: train ]=====
        print_status("train_semantic_analysis", "training semantic analysis")
        self.semantic_analysis.train(corpus, dictionary)

    ####################################################################################################
    ######################[ --- Inference --- ]#########################################################
    ####################################################################################################

    def score_activities_old(self, user_activities, recommend_activities):
        """
        PUBLIC: score_activities
        ------------------------
        Given a user and a list of activities, both represented as json,
        this will return (activities, scores) in a sorted list
        """
        #=====[ Step 1: preprocess json inputs ]=====
        user_events_df = self.preprocess.preprocess_a(user_activities)
        activities_df = self.preprocess.preprocess_a(recommend_activities)

        #=====[ Step 2: construct a user from user_events_df ]=====
        def f():
            yield user_events_df
        users = self.user_analysis.extract_users(f)
        assert len(users) == 1
        user = users.iloc[0]

        #=====[ Step 3: get scores for each one ]=====
        scores = [self.inference.score_match(user, activities_df.iloc[i]) for i in range(len(activities_df))]

        #=====[ Step 4: return sorted list of activity, score ]=====
        return sorted(zip(recommend_activities, scores), key=lambda x: x[1], reverse=True)

    def score_activities(self, user_activities, recommend_activities):
        """
        PUBLIC: score_activities
        ------------------------
        Given a user and a list of activities, both represented as json,
        this will return (activities, scores) in a sorted list
        """
        #=====[ Step 1: preprocess user_activities and recommend_activities ]=====
        user_activities = self.preprocess.preprocess_a(user_activities)
        recommend_activities = self.preprocess.preprocess_a(recommend_activities)

        #=====[ Step 2: get scores for each one ]=====
        scores, act = self.inference.score_activities(user_activities, recommend_activities)
        return scores, act

    ####################################################################################################
    ######################[ --- Interface --- ]#########################################################
    ####################################################################################################

    def print_lda_topics(self):
        """
        PUBLIC: print_lda_topics
        ------------------------
        prints out a representation of the lda topics found in self.semantic_analysis
        """
        self.semantic_analysis.print_lda_topics()
N_val = 0
N_tst = -1

print 'Load training set'
X, y = load(N_trn + N_val)
if N_trn == -1 or X.shape[0] < N_trn + N_val:
    N_trn = X.shape[0]

print 'Load test set'
X_tst, _ = load(N_tst, tst=True)
N_tst = X_tst.shape[0]

print 'Preprocess the data'
from Preprocess import Preprocess
prep_model = Preprocess(strategy_mv='median')
X = prep_model.fit_transform(np.concatenate([X, X_tst]))

# Get back the training, validation and test sets
X_trn = X[:N_trn]
y_trn = y[:N_trn]
X_val = X[N_trn:N_trn + N_val]
y_val = y[N_trn:]
X_tst = X[-N_tst:]

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_jobs=6)

if args.CV:
class SpotOn:

    def __init__(self):
        """
        PUBLIC: Constructor
        -------------------
        constructs member objects
        """
        #=====[ Step 1: create member objects ]=====
        self.preprocess = Preprocess()
        self.storage_delegate = StorageDelegate()
        self.semantic_analysis = SemanticAnalysis()
        self.user_analysis = UserAnalysis()
        self.inference = None
        self.activities_corpus = None

    def load(self):
        """
        PUBLIC: load
        ------------
        loads in all parameters
        """
        #=====[ Step 1: load in semantic analysis ]=====
        print_status("Initialization", "Loading ML parameters (Begin)")
        self.semantic_analysis.load()
        print_status("Initialization", "Loading ML parameters (End)")

        #=====[ Step 2: transfer over models to inference ]=====
        print_status("Initialization", "Constructing Inference instance (Begin)")
        self.inference = Inference(self.semantic_analysis.lda_model, self.semantic_analysis.lda_model_topics)
        print_status("Initialization", "Constructing Inference instance (End)")

    ####################################################################################################
    ######################[ --- Getting Users --- ]#####################################################
    ####################################################################################################

    def get_users(self):
        """
        PUBLIC: get_users
        -----------------
        constructs self.u_df from all available calendar dataframes
        """
        self.u_df = self.user_analysis.extract_users(self.storage_delegate.iter_calendar_dfs)
        self.u_df = self.semantic_analysis.analyze(self.u_df, 'all_event_names')

    def load_users(self, filepath='../data/pandas/users/users.df'):
        """
        PUBLIC: load_users
        ------------------
        constructs self.u_df from a saved file
        """
        self.u_df = pd.read_pickle(filepath)

    ####################################################################################################
    ######################[ --- Training --- ]##########################################################
    ####################################################################################################

    def extract_text(self, activity_row):
        """
        PRIVATE: extract_text
        ---------------------
        given a row representing an activity, this returns a list of
        words representing it as a 'text'
        """
        text = []
        if type(activity_row['name']) == list:
            text += activity_row['name']
        if type(activity_row['words']) == list:
            text += activity_row['words']
        return text

    def get_corpus_dictionary(self):
        """
        PRIVATE: get_corpus_dictionary
        ------------------------------
        Assembles a gensim corpus and dictionary from activities_df,
        where each text is name || words.
        """
        #=====[ Step 1: iterate through all activity dataframes ]=====
        print_status("get_corpus", "assembling texts")
        documents = []
        for df in self.storage_delegate.iter_activity_dfs():
            df['lda_doc'] = df['name'] + df['words']
            documents += list(df['lda_doc'])

        #=====[ Step 2: get dictionary ]=====
        print_status("get_corpus", "assembling dictionary")
        dictionary = gensim.corpora.Dictionary(documents)

        #=====[ Step 3: get corpus ]=====
        print_status("get_corpus", "assembling corpus")
        corpus = [dictionary.doc2bow(d) for d in documents]

        return corpus, dictionary

    def print_lda_topics(self):
        """
        PUBLIC: print_lda_topics
        ------------------------
        prints out a representation of the lda topics found in self.semantic_analysis
        """
        print_header("LDA TOPICS: ")
        self.semantic_analysis.print_lda_topics()

    def train_semantic_analysis(self):
        """
        PUBLIC: train_semantic_analysis
        -------------------------------
        finds parameters for self.semantic_analysis
        """
        #=====[ Step 1: get the corpus ]=====
        print_status("train_semantic_analysis", "getting corpus/dictionary")
        corpus, dictionary = self.get_corpus_dictionary()

        #=====[ Step 2: train ]=====
        print_status("train_semantic_analysis", "training semantic analysis")
        self.semantic_analysis.train(corpus, dictionary)

        #####[ DEBUG: print out lda topics ]#####
        self.print_lda_topics()

    ####################################################################################################
    ######################[ --- Processing --- ]########################################################
    ####################################################################################################

    def activities_json_to_df(self, a_json):
        """
        PRIVATE: activities_json_to_df
        ------------------------------
        given: list of json dicts representing activities
        returns: dataframe with preprocessing, semantic analysis
        """
        a_df = self.preprocess.preprocess_a(a_json)
        a_df = self.semantic_analysis.add_lda_vec_column(a_df)
        a_df = self.semantic_analysis.add_w2v_sum_column(a_df)
        return a_df

    def calendar_events_json_to_df(self, ce_json):
        """
        PRIVATE: calendar_events_json_to_df
        -----------------------------------
        given: list of json dicts representing calendar events
        returns: dataframe with preprocessing, semantic analysis
        """
        ce_df = self.preprocess.preprocess_ce(ce_json)
        ce_df = self.semantic_analysis.add_lda_vec_column(ce_df)
        ce_df = self.semantic_analysis.add_w2v_sum_column(ce_df)
        return ce_df

    def calendar_events_to_user_representation(self, ce_json):
        """
        PUBLIC: calendar_events_to_user_representation
        ----------------------------------------------
        given a list containing json dicts representing calendar events
        belonging to a single user, this will return a representation that
        can be passed to score_activity_for_user and recommend_for_user
        """
        user_df = self.calendar_events_json_to_df(ce_json)
        lda_vec = self.semantic_analysis.get_user_lda_vec(user_df)
        return {'events_df': user_df, 'lda_vec': lda_vec}

    def load_activities_corpus(self, activities):
        '''
        function: load_activities_corpus
        params: activities - list of activities to recommend
        returns: none
        notes: use this function to load a big activities corpus into the
            SpotOn object; later, when calling recommend_for_user, we will pull
            activities to recommend from this corpus.
            Can be called multiple times to update to different activities.
        '''
        self.activities_corpus = self.activities_json_to_df(activities)

    ####################################################################################################
    ######################[ --- Recommending --- ]######################################################
    ####################################################################################################

    def score_activity_for_user(self, user_representation, activity):
        """
        PUBLIC: score_activity_for_user
        -------------------------------
        params: user_representation - representation of the user to score for
                    (created by calendar_events_to_user_representation)
                activity - json of the activity to score
        notes: goes from the representation of the user + one activity
               -> a score for how much they'd like it
        """
        #=====[ Step 1: get activity dataframe ]=====
        activity_df = self.activities_json_to_df([activity])

        #=====[ Step 2: get scored dataframe ]=====
        activity_df = self.inference.infer_scores(user_representation, activity_df)

        #=====[ Step 3: extract and return score ]=====
        return activity_df.iloc[0]['score']

    def recommend_for_user(self, user_representation, activities=None, topn=10):
        """
        PUBLIC: recommend_for_user
        --------------------------
        params: user_representation - representation of the user to recommend for
                activities - either a list of json activities, or None if
                    .load_activities_corpus has been called
                topn - number of recommendations to return
        """
        #=====[ Step 1: get a_df, df of activities to recommend ]=====
        if activities is not None:
            activities_df = self.activities_json_to_df(activities)
        else:
            if self.activities_corpus is None:
                self.load_activities_corpus()
            activities_df = self.activities_corpus

        #=====[ Step 2: get scores, return sorted ]=====
        activity_ranks = self.inference.rank_activities(user_representation, activities_df)
        return list(activity_ranks)

    def recommend_users_for_activity(self, activity, list_of_users, topn=10):
        """
        PUBLIC: recommend_users_for_activity
        ------------------------------------
        params: activity - activity to recommend users for
                list_of_users - list of users to filter
                topn - number of users to return
        notes: goes from an activity and a list of users -> topn users for that activity
        """
        scores = [self.score_activity_for_user(user, activity) for user in list_of_users]
        sorted_ix = np.argsort(scores)[::-1]
        return [list_of_users[sorted_ix[i]] for i in range(topn)]
def getDayScore(self, wordCount):
    positiveScore = 0
    negativeScore = 0
    for word, count in wordCount.iteritems():
        # print word.encode(ignore), count
        wordDict = self.wl.getWordDict(word)
        if wordDict is not None:
            if wordDict['polarity'] == POLARITY['positive']:
                positiveScore += count
            elif wordDict['polarity'] == POLARITY['negative']:
                negativeScore += count
    if negativeScore == 0:
        return float(positiveScore)
    return float(positiveScore) / negativeScore

# Return the score timeseries as a list
def getScoreTimeseries(self, wordCounterTs):
    return wordCounterTs.mapValues(self.getDayScore)


if __name__ == '__main__':
    ts = Preprocess.loadTs()
    sent = SentimentAnalysis()
    t = time.clock()
    a = sent.getScoreTimeseries(TimeSeries(ts.getTopicTs('msft')))
    print time.clock() - t
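A small worked sketch of the day-score rule in getDayScore above, with a stand-in polarity lookup; the words, counts, and polarities are illustrative assumptions, not data from the original.

# Illustrative only: a hand-rolled polarity table standing in for self.wl.
polarity = {"gain": "positive", "growth": "positive", "loss": "negative"}
word_count = {"gain": 3, "growth": 2, "loss": 1}

positive = sum(c for w, c in word_count.items() if polarity.get(w) == "positive")
negative = sum(c for w, c in word_count.items() if polarity.get(w) == "negative")
# ratio rule from getDayScore: (3 + 2) / 1 = 5.0; with no negative words, the score is just the positive count
day_score = float(positive) if negative == 0 else float(positive) / negative
print day_score  # 5.0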
# Script: preprocess_json
# -----------------------
# script to convert raw json of activities to a large pandas dataframe.
# call as 'ipython -i preprocess_json.py' if you want to pickle or play
# with the resulting dataframe afterwards.
import pickle
import json
import time

from Preprocess import Preprocess
from util import *

if __name__ == "__main__":

    print_header("PREPROCESS JSON - convert raw json to dataframe")
    start_time = time.time()

    #=====[ Step 1: create preprocess ]=====
    pre = Preprocess()

    #=====[ Step 2: load json into memory ]=====
    print_status("preprocess_json", "loading json into memory")
    json_filename = '/Users/jayhack/Dropbox/Data/Spoton/activities_0_till_168694.json'
    a_json = json.load(open(json_filename, 'r'))

    #=====[ Step 3: apply Preprocess to it ]=====
    print_header('PREPROCESSING JSON')
    a_df = pre.preprocess_a(a_json)

    ######[ PRINT ELAPSED TIME ]#####
    end_time = time.time()
    print_notification('Elapsed time: ' + str(end_time - start_time))