Example #1
def CheckRSS(phase_url, phase):
    recent_headline = lf.LoadRecentHeadline(phase)
    phase_headlines = feedparser.parse(phase_url)
    entry_len = len(phase_headlines.entries)
    for entry in range(entry_len):
        entry_headline = phase_headlines.entries[entry]['title']
        if entry_headline == recent_headline:
            print('Processed {} headlines'.format(entry))
            print('Done checking phase ' + str(phase) + ' headlines')
            return
        else:
            if entry == 0:
                lf.SaveRecentHeadline(entry_headline, phase)
            GetPrediction(entry_headline, phase)
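
# Illustrative usage only (the feed URL and phase number below are hypothetical):
# CheckRSS('https://example.com/phase3-trials.rss', 3)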
Example #2
def pre_process_labeled(save=True,
                        remove_stopwords=False,
                        stem=False):

    """
    Full pre-processing on labeled data.

    :param save: Optionally save the processed results.
    :param remove_stopwords: optional.
    :param stem: optional.
    """

    train, y = lF.load_labeled(current_path + "/Data/train")
    processed_train = map(lambda r: pre_processing(r, remove_stopwords, stem), train)

    if save:
        # Pickle positive examples.
        positive = np.where(y == 1)[0]

        with open(current_path + "/Data/Processed/PositiveExamples.txt", 'wb') as text_file:
            for idx in positive:
                text_file.write("%s\n" % processed_train[idx])

        # Pickle negative examples.
        negative = np.where(y == 0)[0]

        with open(current_path + "/Data/Processed/NegativeExamples.txt", 'wb') as text_file:
            for idx in negative:
                text_file.write("%s\n" % processed_train[idx])
Example #3
def pre_process_labeled(save=True, remove_stopwords=False, stem=False):
    """
    Full pre-processing on labeled data.

    :param save: Optionally save the processed results.
    :param remove_stopwords: optional.
    :param stem: optional.
    """

    train, y = lF.load_labeled(current_path + "/Data/train")
    processed_train = map(lambda r: pre_processing(r, remove_stopwords, stem),
                          train)

    if save:
        # Pickle positive examples.
        positive = np.where(y == 1)[0]

        with open(current_path + "/Data/Processed/PositiveExamples.txt",
                  'wb') as text_file:
            for idx in positive:
                text_file.write("%s\n" % processed_train[idx])

        # Pickle negative examples.
        negative = np.where(y == 0)[0]

        with open(current_path + "/Data/Processed/NegativeExamples.txt",
                  'wb') as text_file:
            for idx in negative:
                text_file.write("%s\n" % processed_train[idx])
Example #4
def GetPrediction(headline, phase):
    #Breakdown headline
    security_symbol = BreakdownHeadline(headline)
    predicted_sent = HeadlineSentiment(headline)
    today_date = date.today().strftime('%Y%m%d')
    for symbol in security_symbol:
        stock_info = GetStockInfo(symbol)
        #Setup headline entry
        headline_entry = {
            'Security': symbol,
            'Headline': headline,
            'Date': today_date
        }
        headline_series = pd.DataFrame(headline_entry, index=[0])
        base_shape = headline_series.shape[1]
        #Get features
        stock_series = sfp.GetTimeComputes(stock_info)
        headline_series = pd.concat(
            [headline_series,
             pd.DataFrame(stock_series).transpose()], axis=1)
        #Combine headline info and clinic data
        clinic_data = lf.LoadClinicPipeline()
        clinics = pf.PrepareClinicalPipelineData(clinic_data,
                                                 sub_select=symbol)
        clinics.reset_index(inplace=True, drop=True)
        clinics.drop('Security Symbol', axis=1, inplace=True)
        headline_series = pd.concat([headline_series, clinics], axis=1)
        #Prepare for model
        headline_series['Phase'] = phase
        headline_series['TextSent'] = predicted_sent
        test_series = headline_series.iloc[:, base_shape:]
        #Predict
        predict_model = joblib.load('BiotechModel.joblib')
        recommendation = predict_model.predict(test_series)
        headline_series['Prediction'] = recommendation
        lf.SavePrediction(headline_series)
        MacNotification(headline, symbol, recommendation)
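
# Illustrative usage only (the headline text and phase below are hypothetical):
# GetPrediction('Acme Therapeutics reports positive topline Phase 2 data', 2)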
Example #5
def BreakdownHeadline(headline_string):
    usable_companies = lf.LoadCompaniesSimple(str_compatible=True)
    company_names = usable_companies['Company Name']
    headline_string = headline_string.lower()
    matches = {x for x in company_names if x in headline_string}
    short_names = ['ra', 'ani']
    match_name = {x for x in matches if x in short_names}
    if len(match_name) > 0:
        for off_word in match_name:
            if re.search(rf'({off_word}\S|\S{off_word})', headline_string):
                matches.remove(off_word)
    match_security = [
        usable_companies.loc[company_names == x, 'Security Symbol'].item()
        for x in matches
    ]
    if len(match_security) == 0:
        print('No companies found')
    return match_security
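
# Illustrative only: for a headline that mentions a listed company, this returns the
# matching 'Security Symbol' values, e.g. a hypothetical ['ACME'].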
Example #6
def run_preface():
    # Load the files into RAM
    csv_data_dict = LoadFiles.read_files('../csvFiles/*.csv')

    # Run the creation of Movie List with the IMDb dataset
    print('[+] Running Movie list')
    MovieList.run_movie_list(csv_data_dict)
    print('     ..Done with Movie list\n')

    # Run the creation of Knowledge graph with the IMDb dataset
    print('[+] Running knowledge graph')
    KnowledgeGraphLoader.run_kgl(csv_data_dict)
    print('     ..Done with knowledge graph\n')

    # Run the creation of Word2Vec model with the IMDb dataset
    print('\n[+] Running model')
    PrepareData.run_model(csv_data_dict)
    print('     ..Done with model')
Example #7
def pre_process_unlabeled(save=True, remove_stopwords=False, stem=False):
    """
    Full pre-processing on unlabeled data.

    :param save: Optionally save the processed results.
    :param remove_stopwords: optional.
    :param stem: optional.
    """

    test, names = lF.load_unknown(current_path + "/Data/test")
    processed_test = map(lambda r: pre_processing(r, remove_stopwords, stem),
                         test)

    if save:

        # Pickle examples.
        with open(current_path + "/Data/Processed/Unlabeled.txt",
                  'wb') as text_file:
            for idx in xrange(len(processed_test)):
                text_file.write("%s\n" % processed_test[idx])
Example #8
def pre_process_unlabeled(save=True,
                          remove_stopwords=False,
                          stem=False):

    """
    Full pre-processing on unlabeled data.

    :param save: Optionally save the processed results.
    :param remove_stopwords: optional.
    :param stem: optional.
    """

    test, names = lF.load_unknown(current_path + "/Data/test")
    processed_test = map(lambda r: pre_processing(r, remove_stopwords, stem), test)

    if save:

        # Pickle examples.
        with open(current_path + "/Data/Processed/Unlabeled.txt", 'wb') as text_file:
            for idx in xrange(len(processed_test)):
                text_file.write("%s\n" % processed_test[idx])
Example #9

if __name__ == "__main__":

    # Set PySparkTWIDF Context and load data.
    # ---------------------

    sc = SparkContext(
        "local",
        "TW-IDF App",
        pyFiles=['Projet_TM/Preprocessing.py', 'Projet_TM/LoadFiles.py'])
    current_path = os.getcwd()

    print "Loading data..."

    data, Y = lF.load_labeled("./Projet_TM/train")
    # data_train, data_test, labels_train, labels_test = train_test_split(data, Y, test_size=0.2, random_state=42)
    data_rdd = sc.parallelize(data, numSlices=16)

    # Map data to a binary matrix.
    # Get the dictionary of the data.
    # ---------------------

    print "Pre-processing data and broadcasting the dictionary..."

    lists = data_rdd \
        .map(lambda r: re.compile(r"<[^>]+>").sub('', r)) \
        .map(RegexpReplacer().replace) \
        .map(lambda r: re.sub(r"\W+", " ", r)) \
        .map(lambda r: r.lower().split()) \
        .collect()
Example #10

if __name__ == "__main__":

    # Set PySparkTWIDF Context and load data.
    # ---------------------

    sc = SparkContext(
        "local",
        "TW-IDF App",
        pyFiles=['PySparkTWIDF/Preprocessing.py', 'PySparkTWIDF/LoadFiles.py'])
    current_path = os.getcwd()

    print "Loading data..."

    data, Y = lF.load_labeled(current_path + "/Data/train")

    data_train, data_test, labels_train, labels_test = train_test_split(
        data, Y, test_size=0.2, random_state=42)
    data_rdd = sc.parallelize(data_train, numSlices=16)

    # Map data to a binary matrix.
    # Get the dictionary of the data.
    # ---------------------

    print "Pre-processing data and broadcasting the dictionary..."

    lists = data_rdd \
        .map(lambda r: re.compile(r"<[^>]+>").sub('', r)) \
        .map(RegexpReplacer().replace) \
        .map(lambda r: re.sub(r"\W+", " ", r)) \
Example #11
# Reading an excel file using Python
import xlrd
import LoadFiles
import io


# List comparison
def Diff(li1, li2):
    li_dif = [i for i in li1 + li2 if i not in li1 or i not in li2]
    return li_dif
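
# Illustrative only: Diff returns the symmetric difference of two lists,
# e.g. Diff([1, 2, 3], [2, 3, 4]) -> [1, 4]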


FILESBITCH = LoadFiles.init()

print("Comparing files")
print("NEW: " + FILESBITCH[1])
print("OLD: " + FILESBITCH[0])

# Give the location of the file
locOLD = FILESBITCH[0]
locNEW = FILESBITCH[1]

# open both workbooks
wbOLD = xlrd.open_workbook(locOLD)
wbNEW = xlrd.open_workbook(locNEW)
sheetOLD = wbOLD.sheet_by_index(0)
sheetNEW = wbNEW.sheet_by_index(0)

# Load older version list
songListOLD = []
for i in range(1, sheetOLD.nrows):
Example #12
    def getApi(self, apiPath):
        self.auth = LoadFiles.LoadAPI(apiPath).auth
Example #13
    def getSetting(self, setting):
        self.pref = LoadFiles.LoadSetting(setting).setting
Example #14
        elif (elem == 1):
            sub_class_1 = sub_class_1 + 1
        else:
            sub_class_2 = sub_class_2 + 1

    total = len(kmeans_predicted_test_images)

    print("Total image labels: ", total)
    print("Probability for class 0: ", (sub_class_0 / total) * 100)
    print("Probability for class 1: ", (sub_class_1 / total) * 100)
    print("Probability for class 2: ", (sub_class_2 / total) * 100)


if __name__ == "__main__":
    # prerequisites
    file_loader = LoadFiles()
    loaded_images = file_loader.load_ORL_face_data_set_40x30()
    loaded_labels = file_loader.load_ORL_labels()

    kmeans_labels, kmeans_predicted, pca_centers, pca_images_training, test_images_pca = nearest_sub_class_centroid(
        5, loaded_images, loaded_labels)
    calculate_success_rate(kmeans_predicted)
    plot_data(kmeans_labels, kmeans_predicted, pca_centers,
              pca_images_training, test_images_pca)

    training_images = fetch_NSC_training_set(3, loaded_images, loaded_labels)
    elbow_data = [training_images[i][0] for i in range(len(training_images))]
    plot_elbow_graph(elbow_data)

    kmean_labels3, kmeans_predicted3, pca_centers3, pca_images3_training, test_images_pca3 = nearest_sub_class_centroid(
        3, loaded_images, loaded_labels)
Example #15
        for j in xrange(1, temp_w):
            next_word = word_list[k + j]
            dg.add_edge(word, next_word)


if __name__ == "__main__":

    # Set PySparkTWIDF Context and load data.
    # ---------------------

    sc = SparkContext("local", "TW-IDF App", pyFiles=['Projet_TM/Preprocessing.py', 'Projet_TM/LoadFiles.py'])
    current_path = os.getcwd()

    print "Loading data..."

    data, Y = lF.load_labeled("./Projet_TM/train")
    # data_train, data_test, labels_train, labels_test = train_test_split(data, Y, test_size=0.2, random_state=42)
    data_rdd = sc.parallelize(data, numSlices=16)

    # Map data to a binary matrix.
    # Get the dictionary of the data.
    # ---------------------

    print "Pre-processing data and broadcasting the dictionary..."

    lists = data_rdd \
        .map(lambda r: re.compile(r"<[^>]+>").sub('', r)) \
        .map(RegexpReplacer().replace) \
        .map(lambda r: re.sub(r"\W+", " ", r)) \
        .map(lambda r: r.lower().split()) \
        .collect()
Example #16
#Import libraries
import pandas as pd
import numpy as np
import LoadFiles
from datetime import datetime, date
import StockFeaturePrepare as sfp
from PipelineFunctions import GetHeadlineHistory, DefineEventResult

#Setup crude prediction pipeline (simplified for git)
from sklearn.preprocessing import robust_scale
from sklearn.model_selection import KFold
from sklearn.svm import SVC

#Load files
headlines = LoadFiles.LoadHeadlines()
companies = LoadFiles.LoadCompanies()
stocks = LoadFiles.LoadStocks()

#Convert datetime of manual headlines to be compatible with stock data
datey = headlines.loc[:, 'Date'].astype(str)
new_date = datey.apply(
    lambda x: datetime.strptime('20' + x, '%Y%m%d').strftime('%Y-%m-%d'))
headlines['Stock_date'] = new_date
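
# Illustrative only: a yymmdd 'Date' value such as 200115 becomes '2020-01-15'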

#Convert each stock's dates to ISO format so they match the headline dates
for stock_key in list(stocks.keys()):
    stock_dates = stocks[stock_key].loc[:, 'Date'].astype(str)
    new_date = stock_dates.apply(
        lambda x: datetime.strptime(x, '%m/%d/%y').strftime('%Y-%m-%d'))
    stocks[stock_key]['Iso_date'] = new_date