def CheckRSS(phase_url, phase):
    recent_headline = lf.LoadRecentHeadline(phase)
    phase_headlines = feedparser.parse(phase_url)
    entry_len = len(phase_headlines.entries)
    for entry in range(entry_len):
        entry_headline = phase_headlines.entries[entry]['title']
        if entry_headline == recent_headline:
            print('Processed {} headlines'.format(entry))
            print('Done checking phase ' + str(phase) + ' headlines')
            return
        else:
            if entry == 0:
                lf.SaveRecentHeadline(entry_headline, phase)
            GetPrediction(entry_headline, phase)
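# Minimal usage sketch for CheckRSS; the feed URL and phase value below are
# hypothetical placeholders, not values from the original project.
if __name__ == '__main__':
    CheckRSS('https://www.example.com/phase3-trials.rss', phase=3)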
def pre_process_labeled(save=True, remove_stopwords=False, stem=False):
    """
    Full pre-processing on labeled data.

    :param save: Optionally save the processed results.
    :param remove_stopwords: Optionally remove stopwords.
    :param stem: Optionally stem the tokens.
    """
    train, y = lF.load_labeled(current_path + "/Data/train")
    processed_train = map(lambda r: pre_processing(r, remove_stopwords, stem), train)
    if save:
        # Write positive examples to a text file.
        positive = np.where(y == 1)[0]
        with open(current_path + "/Data/Processed/PositiveExamples.txt", 'wb') as text_file:
            for idx in positive:
                text_file.write("%s\n" % processed_train[idx])
        # Write negative examples to a text file.
        negative = np.where(y == 0)[0]
        with open(current_path + "/Data/Processed/NegativeExamples.txt", 'wb') as text_file:
            for idx in negative:
                text_file.write("%s\n" % processed_train[idx])
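# Illustration of the label split above (values are made up): with
# y = np.array([1, 0, 1, 0]), np.where(y == 1)[0] is array([0, 2]) and
# np.where(y == 0)[0] is array([1, 3]), i.e. the row indices written to the
# positive and negative files respectively.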
def GetPrediction(headline, phase):
    # Break the headline down into the securities it mentions and its sentiment
    security_symbol = BreakdownHeadline(headline)
    predicted_sent = HeadlineSentiment(headline)
    today_date = date.today().strftime('%Y%m%d')

    for symbol in security_symbol:
        stock_info = GetStockInfo(symbol)

        # Set up the headline entry
        headline_entry = {
            'Security': symbol,
            'Headline': headline,
            'Date': today_date
        }
        headline_series = pd.DataFrame(headline_entry, index=[0])
        base_shape = headline_series.shape[1]

        # Get stock-based features
        stock_series = sfp.GetTimeComputes(stock_info)
        headline_series = pd.concat(
            [headline_series, pd.DataFrame(stock_series).transpose()], axis=1)

        # Combine headline info and clinical pipeline data
        clinic_data = lf.LoadClinicPipeline()
        clinics = pf.PrepareClinicalPipelineData(clinic_data, sub_select=symbol)
        clinics.reset_index(inplace=True, drop=True)
        clinics.drop('Security Symbol', axis=1, inplace=True)
        headline_series = pd.concat([headline_series, clinics], axis=1)

        # Prepare for the model: keep only the feature columns, dropping the
        # identifying columns counted in base_shape
        headline_series['Phase'] = phase
        headline_series['TextSent'] = predicted_sent
        test_series = headline_series.iloc[:, base_shape:]

        # Predict, save the result and notify
        predict_model = joblib.load('BiotechModel.joblib')
        recommendation = predict_model.predict(test_series)
        headline_series['Prediction'] = recommendation
        lf.SavePrediction(headline_series)
        MacNotification(headline, symbol, recommendation)
def BreakdownHeadline(headline_string):
    usable_companies = lf.LoadCompaniesSimple(str_compatible=True)
    company_names = usable_companies['Company Name']
    headline_string = headline_string.lower()
    matches = {x for x in company_names if x in headline_string}

    # Very short company names only count when they appear as standalone words,
    # not as part of a longer word
    short_names = ['ra', 'ani']
    match_name = {x for x in matches if x in short_names}
    if len(match_name) > 0:
        for off_word in match_name:
            if re.search(rf'({off_word}\S|\S{off_word})', headline_string):
                matches.remove(off_word)

    match_security = [
        usable_companies.loc[company_names == x, 'Security Symbol'].item()
        for x in matches
    ]
    if len(match_security) == 0:
        print('No companies found')
    return match_security
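# Standalone sketch of the short-name filter above; the sample headlines are
# made up. A short name is dropped when the regex finds it glued to another
# non-space character, and kept when it appears as its own word.
import re

for text in ['new gene therapy results announced',  # 'ra' only occurs inside 'therapy'
             'ra medical posts phase 3 update']:    # 'ra' stands alone
    embedded = re.search(r'(ra\S|\Sra)', text)
    print(text, '->', 'drop' if embedded else 'keep')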
def run_preface():
    # Load the files into RAM
    csv_data_dict = LoadFiles.read_files('../csvFiles/*.csv')

    # Run the creation of the movie list with the IMDb dataset
    print('[+] Running Movie list')
    MovieList.run_movie_list(csv_data_dict)
    print(' ..Done with Movie list\n')

    # Run the creation of the knowledge graph with the IMDb dataset
    print('[+] Running knowledge graph')
    KnowledgeGraphLoader.run_kgl(csv_data_dict)
    print(' ..Done with knowledge graph\n')

    # Run the creation of the Word2Vec model with the IMDb dataset
    print('\n[+] Running model')
    PrepareData.run_model(csv_data_dict)
    print(' ..Done with model')
def pre_process_unlabeled(save=True, remove_stopwords=False, stem=False):
    """
    Full pre-processing on unlabeled data.

    :param save: Optionally save the processed results.
    :param remove_stopwords: Optionally remove stopwords.
    :param stem: Optionally stem the tokens.
    """
    test, names = lF.load_unknown(current_path + "/Data/test")
    processed_test = map(lambda r: pre_processing(r, remove_stopwords, stem), test)
    if save:
        # Write the processed examples to a text file.
        with open(current_path + "/Data/Processed/Unlabeled.txt", 'wb') as text_file:
            for idx in xrange(len(processed_test)):
                text_file.write("%s\n" % processed_test[idx])
if __name__ == "__main__": # Set PySparkTWIDF Context and load data. # --------------------- sc = SparkContext( "local", "TW-IDF App", pyFiles=['Projet_TM/Preprocessing.py', 'Projet_TM/LoadFiles.py']) current_path = os.getcwd() print "Loading data..." data, Y = lF.load_labeled("./Projet_TM/train") # data_train, data_test, labels_train, labels_test = train_test_split(data, Y, test_size=0.2, random_state=42) data_rdd = sc.parallelize(data, numSlices=16) # Map data to a binary matrix. # Get the dictionary of the data. # --------------------- print "Pre-processing data and broadcasting the dictionary..." lists = data_rdd \ .map(lambda r: re.compile(r"<[^>]+>").sub('', r)) \ .map(RegexpReplacer().replace) \ .map(lambda r: re.sub(r"\W+", " ", r)) \ .map(lambda r: r.lower().split()) \ .collect()
if __name__ == "__main__": # Set PySparkTWIDF Context and load data. # --------------------- sc = SparkContext( "local", "TW-IDF App", pyFiles=['PySparkTWIDF/Preprocessing.py', 'PySparkTWIDF/LoadFiles.py']) current_path = os.getcwd() print "Loading data..." data, Y = lF.load_labeled(current_path + "/Data/train") data_train, data_test, labels_train, labels_test = train_test_split( data, Y, test_size=0.2, random_state=42) data_rdd = sc.parallelize(data_train, numSlices=16) # Map data to a binary matrix. # Get the dictionary of the data. # --------------------- print "Pre-processing data and broadcasting the dictionary..." lists = data_rdd \ .map(lambda r: re.compile(r"<[^>]+>").sub('', r)) \ .map(RegexpReplacer().replace) \ .map(lambda r: re.sub(r"\W+", " ", r)) \
# Reading an excel file using Python
import xlrd
import LoadFiles
import io


# List comparison
def Diff(li1, li2):
    li_dif = [i for i in li1 + li2 if i not in li1 or i not in li2]
    return li_dif


FILESBITCH = LoadFiles.init()
print("Comparing files")
print("NEW: " + FILESBITCH[1])
print("OLD: " + FILESBITCH[0])

# Give the location of the files
locOLD = FILESBITCH[0]
locNEW = FILESBITCH[1]

# open both workbooks
wbOLD = xlrd.open_workbook(locOLD)
wbNEW = xlrd.open_workbook(locNEW)
sheetOLD = wbOLD.sheet_by_index(0)
sheetNEW = wbNEW.sheet_by_index(0)

# Load older version list
songListOLD = []
for i in range(1, sheetOLD.nrows):
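# Quick sanity check for the Diff helper defined above (illustrative values):
# Diff(['a', 'b', 'c'], ['b', 'c', 'd']) returns ['a', 'd'] -- the elements
# that appear in only one of the two lists.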
def getApi(self, apiPath):
    self.auth = LoadFiles.LoadAPI(apiPath).auth
def getSetting(self, setting):
    self.pref = LoadFiles.LoadSetting(setting).setting
        elif elem == 1:
            sub_class_1 = sub_class_1 + 1
        else:
            sub_class_2 = sub_class_2 + 1

    total = len(kmeans_predicted_test_images)
    print("Total image labels: ", total)
    print("Probability for class 0: ", (sub_class_0 / total) * 100)
    print("Probability for class 1: ", (sub_class_1 / total) * 100)
    print("Probability for class 2: ", (sub_class_2 / total) * 100)


if __name__ == "__main__":
    # prerequisites
    file_loader = LoadFiles()
    loaded_images = file_loader.load_ORL_face_data_set_40x30()
    loaded_labels = file_loader.load_ORL_labels()

    kmeans_labels, kmeans_predicted, pca_centers, pca_images_training, test_images_pca = nearest_sub_class_centroid(
        5, loaded_images, loaded_labels)
    calculate_success_rate(kmeans_predicted)
    plot_data(kmeans_labels, kmeans_predicted, pca_centers,
              pca_images_training, test_images_pca)

    training_images = fetch_NSC_training_set(3, loaded_images, loaded_labels)
    elbow_data = [training_images[i][0] for i in range(len(training_images))]
    plot_elbow_graph(elbow_data)

    kmean_labels3, kmeans_predicted3, pca_centers3, pca_images3_training, test_images_pca3 = nearest_sub_class_centroid(
        3, loaded_images, loaded_labels)
        # Link the current word to the next temp_w - 1 words in the sliding window
        for j in xrange(1, temp_w):
            next_word = word_list[k + j]
            dg.add_edge(word, next_word)


if __name__ == "__main__":
    # Set PySparkTWIDF Context and load data.
    # ---------------------
    sc = SparkContext("local", "TW-IDF App",
                      pyFiles=['Projet_TM/Preprocessing.py', 'Projet_TM/LoadFiles.py'])
    current_path = os.getcwd()

    print "Loading data..."
    data, Y = lF.load_labeled("./Projet_TM/train")
    # data_train, data_test, labels_train, labels_test = train_test_split(data, Y, test_size=0.2, random_state=42)
    data_rdd = sc.parallelize(data, numSlices=16)

    # Map data to a binary matrix.
    # Get the dictionary of the data.
    # ---------------------
    print "Pre-processing data and broadcasting the dictionary..."
    lists = data_rdd \
        .map(lambda r: re.compile(r"<[^>]+>").sub('', r)) \
        .map(RegexpReplacer().replace) \
        .map(lambda r: re.sub(r"\W+", " ", r)) \
        .map(lambda r: r.lower().split()) \
        .collect()
# Import libraries
import pandas as pd
import numpy as np
import LoadFiles
from datetime import datetime, date
import StockFeaturePrepare as sfp
from PipelineFunctions import GetHeadlineHistory, DefineEventResult

# Setup crude prediction pipeline (simplified for git)
from sklearn.preprocessing import robust_scale
from sklearn.model_selection import KFold
from sklearn.svm import SVC

# Load files
headlines = LoadFiles.LoadHeadlines()
companies = LoadFiles.LoadCompanies()
stocks = LoadFiles.LoadStocks()

# Convert datetime of manual headlines to be compatible with stock data
datey = headlines.loc[:, 'Date'].astype(str)
new_date = datey.apply(
    lambda x: datetime.strptime('20' + x, '%Y%m%d').strftime('%Y-%m-%d'))
headlines['Stock_date'] = new_date

# Apply the same date conversion to each stock's price data
for stock_key in list(stocks.keys()):
    stock_dates = stocks[stock_key].loc[:, 'Date'].astype(str)
    new_date = stock_dates.apply(
        lambda x: datetime.strptime(x, '%m/%d/%y').strftime('%Y-%m-%d'))
    stocks[stock_key]['Iso_date'] = new_date
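# Worked example of the two conversions above (sample values only): a headline
# date stored as 'yymmdd' and a stock date stored as 'mm/dd/yy' end up in the
# same ISO format.
assert datetime.strptime('20' + '231005', '%Y%m%d').strftime('%Y-%m-%d') == '2023-10-05'
assert datetime.strptime('10/05/23', '%m/%d/%y').strftime('%Y-%m-%d') == '2023-10-05'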