def createBigramTrigram(infile, outfile, type):

    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    plt = []

    # create bigrams or trigrams for each tweet
    for tweet in items:
        if type == 'b':
            plt.append(bigramas(tweet['text']))
        else:
            plt.append(trigramas(tweet['text']))

    # join every n-gram list into a single string
    palavras = ''
    for ngram_list in plt:
        for ngram in ngram_list:
            palavras += ngram.replace('\n', ' ').replace('\t', '') + ' '

    # count the occurrences of each n-gram
    count = {}
    for word in palavras.split(" "):
        if len(word) == 0:
            continue
        if word not in count:
            count[word] = 0
        count[word] += 1

    # sort by frequency, most frequent first
    l = sorted(count.items(), key=lambda x: -x[1])

    write_file(infile, outfile, l)
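
# Usage sketch (hypothetical file names; Loader, bigramas/trigramas and
# write_file are assumed to be provided by this project's modules):
# createBigramTrigram('tweets.json', 'bigram_counts.txt', 'b')   # bigrams
# createBigramTrigram('tweets.json', 'trigram_counts.txt', 't')  # trigrams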
Example #2
def createDict(infile):

    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    # count how many tweets each user posted
    dic = collections.defaultdict(int)

    for tweet in items:
        dic[tweet['user_name']] += 1

    # sort by count, most active users first
    list_x = sorted(dic.items(), key=lambda kv: kv[1], reverse=True)
    return list_x
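
# Usage sketch (hypothetical file name):
# top_users = createDict('tweets.json')
# print(top_users[:10])  # ten most active users as (user_name, count) pairs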
Example #3
def sanitize(infile, outfile, stopwords, emoji, rt):
    # initialize cleaner and load stopwords
    cleaner = TweetCleaner()
    stopwords = cleaner.load_stopwords(stopwords)

    #read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    #remove stopwords and emoji from tweets
    for tweet in items:
        tweet['text'] = cleaner.standardize_quotes(tweet['text'])
        tweet['text'] = cleaner.clean_apostrophe_s(tweet['text'])
        tweet['text'] = cleaner.remove_urls(tweet['text'])
        tweet['text'] = cleaner.remove_symbols(tweet['text'])
        tweet['text'] = cleaner.remove_stopwords(tweet['text'], stopwords)
        if not emoji:
            tweet['text'] = cleaner.remove_emoji(tweet['text'])
        if rt:
            cleaner.remove_rts(items, tweet)

    write_file(infile, outfile, items)
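
# Usage sketch (hypothetical paths; stopword files follow the naming used in
# report() further below):
# sanitize('tweets.json', 'tweets_clean.json',
#          ['stopwords/stopwords_en.txt'], emoji=False, rt=True)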
Example #4
def load_pays():
    load = Loader(
        "https://simplonline-v3-prod.s3.eu-west-3.amazonaws.com/media/file/csv/25d9c746-3622-4c48-835e-d7ccafa311f5.csv",
        "../datas/RAW/"
    )
    csv_path = load.ensure_data_loaded()
    pec.clean_csv(
        '../datas/RAW/' + csv_path,
        "../datas/CURATED/pays_en_chiffre.csv")
    pec.jsonify_csv(
        "../datas/CURATED/pays_en_chiffre.csv",
        "../datas/CURATED/pays_en_chiffre.json"
    )

    with open("../datas/CURATED/pays_en_chiffre.json") as jsonfile:
        pays = json.load(jsonfile)

    # drop any existing collection before re-inserting the documents
    if "pays" in mongo.db.list_collection_names():
        mongo.db.pays.drop()
    db_pays = mongo.db["pays"]
    db_pays.insert_many(pays)

    return jsonify(
        etat="success"
    )
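
# Usage sketch (assumes a Flask app object named `app`; the snippet does not
# show how this view is registered):
# app.add_url_rule('/load_pays', view_func=load_pays)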
from utils.constants import VILLE_NAME
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
from modules.saver import Saver
import datetime as dt
import pandas as pd

save_path = 'D:\\Users\\Yuan.ZHANG\\PycharmProjects\\compa0516\\data_save'
CURRENT_TIME_AP = '2018-05-15'
CURRENT_TIME_INT = '2018_05_15'
Intfilename_lst = [
    "BDDExportInterventions-{} du 01_01_2013 au 15_05_2018.xlsx".format(
        CURRENT_TIME_INT)
]
loader = Loader(datadir="D:\\Users\Yuan.ZHANG\\PycharmProjects\\data")

saver = Saver()
cleaner = Cleaner()
analyzer = Analyzer()

# # standardize the format of the dataframe
# for ville in VILLE_NAME:
#     # rename the dataframe,remove redundant info and save
#     data_Arm = loader.load_ArmPL(foldername=ville,filename="BDDExport_ArmoireBt_{}_{}.xlsx".format(ville,CURRENT_TIME_AP), NAME_LIST=Armoire_NAME)
#     data_PL = loader.load_ArmPL(foldername=ville,filename="BDDExport_PointLumineux_{}_{}.xlsx".format(ville,CURRENT_TIME_AP), NAME_LIST=PL_NAME)
#     data_Int = loader.load_Intervention(foldername=ville,filename_lst=Intfilename_lst, NAME_LIST=Int_NAME)
#
#     data_Arm = cleaner.rv_dupRow(data_Arm)
#     data_Ar = cleaner.rep_dur(data_Arm, Var_lst=Armoire_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))
#     data_PL = cleaner.rv_dupRow(data_PL)
Example #6
from utils.constants import Armoire_NAME, Armoire_ARM_CAT, Armoire_DEPART_CAT, Armoire_TIME, Armoire_ARM_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(
    datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

# load the data
data_Ar = loader.load_ArmPL(
    filename="BDDExport_ArmoireBt_NOUMEA_2018-05-15.xlsx",
    NAME_LIST=Armoire_NAME)

# remove the duplicated rows and replace the date with the duration
data_Ar = cleaner.rv_dupRow(data_Ar)
data_Ar = cleaner.rep_dur(data_Ar,
                          Var_lst=Armoire_TIME,
                          currtime=dt.datetime(2018, 5, 5, 0, 0, 0, 0))

# generate the count for NAN for all the variables
analyser.gen_NAN_excel(data_Ar.iloc[:, 0:43], 'Armoire_arm', 'Armoire_arm_or')
analyser.gen_NAN_excel(data_Ar.iloc[:, 43:], 'Armoire_depart',
                       'Armoire_depart_or')

# pick the variables and regroup
data_Ar_arm = analyser.pick_Var(data=data_Ar,
                                Var_lst=Armoire_ARM_CAT + Armoire_ARM_DIST)
Example #7
with open('body_face_sample.pickle', 'rb') as f:
    body_face_samples: {str: Sample} = pickle.load(f)
with open('car_sample.pickle', 'rb') as f:
    car_samples: {str: Sample} = pickle.load(f)

custom_samples = body_face_samples.copy()
custom_samples.update(car_samples)

keys = list(custom_samples.keys())
random.shuffle(keys)
custom_samples = {key: custom_samples[key] for key in keys}
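# dicts preserve insertion order (Python 3.7+), so rebuilding from the
# shuffled key list yields a randomly ordered sample mapping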

settings = ProjectSettings("settings.yaml")

# Load the label mapping.
loader = Loader()
loader.load_labels(settings.LABELS_FILE)

body_face_labels = [
    '/m/04yx4', '/m/03bt1vf', '/m/01g317', '/m/05r655', '/m/01bl7v',
    '/m/0dzct', '/m/04hgtk'
]

car_labels = ['/m/01prls']

for key, value in custom_samples.items():
    labelled_image = value.get_visualized_image_custom_label(
        label_map_function=loader.get_label,
        custom_label=car_labels + body_face_labels)
    cv2.imwrite(
        ProjectSettings.instance().CUSTOM_LABELLED_DIRECTORY + key + '.jpg',
Example #8
from utils.constants import PL_NAME, PL_TIME, PL_PL_CAT, PL_PL_DIST, PL_LAN_CAT, PL_LAN_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()
data_PL = loader.load_ArmPL(filename="BDDExport_PointLumineux_NOUMEA_2018-05-15.xlsx", NAME_LIST=PL_NAME)

data_PL = cleaner.rv_dupRow(data_PL)
data_PL = cleaner.rep_dur(data_PL, Var_lst=PL_TIME, currtime=dt.datetime(2018, 5, 5, 0, 0, 0, 0))

data_PL_PL = analyser.pick_Var(data=data_PL, Var_lst=PL_PL_CAT + PL_PL_DIST)
data_PL_LAN = analyser.pick_Var(data=data_PL, Var_lst=PL_LAN_CAT + PL_LAN_DIST)
data_PL_PL = cleaner.rv_dupRow(data_PL_PL)

analyser.gen_NAN_excel(data_PL.iloc[:, 0:60], 'PL_PL', 'PL_PL_or')
analyser.gen_NAN_excel(data_PL.iloc[:, 60:], 'PL_LAN', 'PL_LAN_or')

analyser.gen_histogram_Pie(data_PL_PL, 'PL_PL', Var_lst=PL_PL_CAT)
analyser.gen_histogram_Pie(data_PL_LAN, 'PL_LAN', Var_lst=PL_LAN_CAT)

analyser.gen_Dist(data_PL, 'PL_PL', Var_lst=PL_PL_DIST)
analyser.gen_Dist(data_PL_LAN, 'PL_LAN', Var_lst=PL_LAN_DIST)

analyser.gen_NAN_excel(data_PL_PL, 'PL_PL', 'PL_PL')
analyser.gen_NAN_excel(data_PL_LAN, 'PL_LAN', 'PL_LAN')
Example #9
from modules.analyser import Analyzer
from modules.loader import Loader
import numpy as np

date_str = '0723'
analyzer = Analyzer(datestr=date_str)
loader = Loader(date_str)


# ArmInt_cluster = loader.load_excel(filename='ArmInt_cluster',foldername='Cluster')
# ArmInt_cluster.drop(['PanneDelai_1'], axis=1,inplace=True)
# feature_names = np.array(list(ArmInt_cluster.columns))
#
# clf = loader.load_pickle('Randomforest_Armoire')
# analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,featurenames=feature_names,title='Randomforest_featureimportance_Armoire',top_n=40)
#
# clf = loader.load_pickle('GradientBoosting_Armoire')
# analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,featurenames=feature_names,title='GradientBoosting_featureimportance_Armoire',top_n=40)
#



PL_cluster = loader.load_excel(filename='PL_cluster', foldername='Cluster')
PL_cluster.drop(['PanneDelai_1'], axis=1, inplace=True)
feature_names = np.array(list(PL_cluster.columns))

clf = loader.load_pickle('Randomforest_PL')
analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,
                                 featurenames=feature_names,
                                 title='Randomforest_featureimportance_PL',
                                 top_n=40)

clf = loader.load_pickle('GradientBoosting_PL')
analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,
                                 featurenames=feature_names,
                                 title='GradientBoosting_featureimportance_PL',
                                 top_n=40)
Example #10
from modules.preprocessor import Processor
from modules.analyser import Analyzer
from modules.cleaner import Cleaner
from modules.loader import Loader
from modules.saver import Saver
from modules.plotter import Plotter
# Cluster and Modeler are also instantiated below; their imports are not
# shown in this snippet
import datetime as dt
import seaborn as sns
import os
import pandas as pd
from utils.constants import Armoire_MERGE, Int_MERGE, PL_MERGE
import sklearn.feature_selection
from sklearn.feature_selection import f_regression, mutual_info_regression
from scipy.cluster.hierarchy import linkage


# the date the data was saved
date_str = '0723'
analyzer = Analyzer(datestr=date_str)
cleaner = Cleaner()
loader = Loader(datestr=date_str)
saver = Saver(datestr=date_str)
processor = Processor(datestr=date_str)
plotter = Plotter(datestr=date_str)
cluster = Cluster(datestr=date_str)
modeler = Modeler(datestr=date_str)

data_downloadtime = dt.datetime(2018, 5, 15, 0, 0, 0, 0)
data_starttime = dt.datetime(2013, 1, 1, 0, 0, 0, 0)
day_difference = (data_downloadtime - data_starttime).days
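# (2013-01-01 to 2018-05-15 is 1960 days, so day_difference == 1960)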

CURRENT_TIME_AP = '2018-05-15'
CURRENT_TIME_INT = '2018_05_15'
Intfilename_lst = ["BDDExportInterventions-{} du 01_01_2013 au 15_05_2018.xlsx".format(CURRENT_TIME_INT)]

"""Attention: for this project, I dupmp the data of BOGOR
Example #11
import json
import sys

import numpy as np

from modules.loader import Loader
from modules.neural import Neural

# Configuration (EarlyStop, used below, is assumed to come from the project's
# own modules; its import is not shown in this snippet)
with open('./config.json') as configfile:
    config = json.load(configfile)
data_dir = config["data_dir"]

neural_network_config = config["neural-network"]
reload_weights = neural_network_config["reload_weights"]

weight_dir = neural_network_config["weight_dir"]

if len(sys.argv) > 1:
    epochs = int(sys.argv[1])
else:
    epochs = neural_network_config["epochs"]

# Utility classes
loader = Loader(neural_network_config)

# Load stuff
X = np.load(data_dir + "/features.npy")
Y = np.load(data_dir + "/labels.npy")

# Training stuff
x_train, y_train, x_test, y_test = loader.load(X, Y)
x_train, y_train, x_test, y_test = (x_train[:, :, 1:], y_train[:, :, 1:],
                                    x_test[:, :, 1:], y_test[:, :, 1:])

input_dim = x_train.shape[2]

neural = Neural(input_dim, neural_network_config)
neural.set_callbacks(EarlyStop(5))
if reload_weights:
    neural.load(weight_dir + "/weights")
                        help="How many do we want to visualize?")
    return parser.parse_args()


args = get_args()
set_index = args.set_index
sample_count = args.sample_count

if __name__ == "__main__":

    # Load the project settings and required modules.
    Logger.log_special("Running Sample Loader", with_gap=True)
    settings = ProjectSettings("settings.yaml")

    # Load the label mapping.
    loader = Loader()
    loader.load_labels(settings.LABELS_FILE)
    Logger.log_field("Labels Loaded", len(loader.label_map))

    # Load the samples from the set that we want.
    samples = Loader.load_sample_set(set_index)
    loaded_samples = [
        s for s in samples
        if (s.is_locally_loaded and len(s.detect_regions) > 0)
    ]

    # How many samples loaded?
    n_loaded_samples = len(loaded_samples)
    Logger.log_field("Samples with Images", n_loaded_samples)
    if n_loaded_samples == 0:
        raise Exception(
from utils.constants import Int_NAME, Int_TIME, Int_INT_CAT, Int_INT_DIST, Int_PAN_CAT, Int_PAN_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

Intfilename_lst = ["BDDExportInterventions-2018_05_15 du 01_01_2013 au 15_05_2018.xlsx"]

data_Int = loader.load_Intervention(filename_lst=Intfilename_lst, NAME_LIST=Int_NAME)
data_Int = cleaner.rv_dupRow(data_Int)
data_Int = cleaner.rep_dur(data_Int, Var_lst=Int_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))

data_Int_PAN = analyser.pick_Var(data=data_Int, Var_lst=Int_PAN_CAT + Int_PAN_DIST + ['pan_Code'])
data_Int_PAN = cleaner.rv_dupRow(data=data_Int_PAN, Var_lst=['pan_Code'])
data_Int_INT = analyser.pick_Var(data=data_Int, Var_lst=Int_INT_CAT + Int_INT_DIST)
data_Int_INT = cleaner.rv_dupRow(data_Int_INT)

analyser.gen_NAN_excel(data_Int.iloc[:, 0:23], 'Intervention_int', 'Intervention_int_or')
analyser.gen_NAN_excel(data_Int.iloc[:, 23:], 'Intervention_pan', 'Intervention_pan_or')

analyser.gen_histogram_Pie(data_Int_INT, 'Intervention_int', Var_lst=Int_INT_CAT)
analyser.gen_histogram_Pie(data_Int_PAN, 'Intervention_pan', Var_lst=Int_PAN_CAT)

analyser.gen_Dist(data_Int_INT, 'Intervention_int', Var_lst=Int_INT_DIST)
analyser.gen_Dist(data_Int_PAN, 'Intervention_pan', Var_lst=Int_PAN_DIST)

analyser.gen_NAN_excel(data_Int_INT, 'Intervention_int', 'Intervention_int')
Example #14
    # Load the project settings and required modules.
    Logger.log_special("Running Sample Loader", with_gap=True)
    settings = ProjectSettings("settings.yaml")

    set_path = os.path.join(settings.SAMPLES_DIRECTORY,
                            f"sample_set_{set_index}.json")
    if not os.path.exists(set_path):
        Logger.log_field(
            "Error",
            "No file found at {}. Have you created the samples using "
            "cmd_create_samples yet?".format(set_path))
        exit(1)

    Logger.log_special("Begin Sample Image Download", with_gap=True)
    samples = Loader.load_sample_set_from_file(set_path)
    unloaded_samples = [s for s in samples if not s.is_locally_loaded]
    n_unloaded_samples = len(unloaded_samples)
    n_samples = len(samples)
    Logger.log_field("Samples Loaded",
                     "{}/{}".format(n_samples - n_unloaded_samples, n_samples))

    i = 0

    for sample in unloaded_samples:
        while True:
            if threading.active_count() <= max_threads:
                thread = threading.Thread(target=sample.load)
                thread.start()
                break
            else:
Example #15
from modules.preprocessing import Processor
from modules.loader import Loader
from modules.analyzer import Analyzer
from utils.constants import Var_NAME, STOP_LIST
from modules.cleaner import Cleaner
from modules.plotter import Plotter
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


savepath = "/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/Call0502/save_data/"
# ==================load the data==================
loader = Loader()
Call_file = "Reporting Call Freshmile.xlsx"
raw_data = loader.load_Callfile(filename=Call_file, NAME_LIST=Var_NAME)

# ================merge "problem" and "action"====================
processor = Processor()
# raw_data = processor.merge_col(data,Var_lst=['Problem','Action'])

# ================remove stop words, numbers, punctuation, operator, tokenize, stemming===========
cleaner = Cleaner()
data = cleaner.remove_digits_dataframe(raw_data, var='Problem')
data.to_excel(savepath + 'tp_rv_digits.xlsx')

data = cleaner.remove_punctuation_dataframe(data, var='Problem')
data.to_excel(savepath + 'tp_rv_punctuation.xlsx')

data = cleaner.remove_stop_words_dataframe(data, stopwords_to_add=STOP_LIST, var='Problem')
data.to_excel(savepath + 'tp_rv_stopwords.xlsx')
Example #16
def train(labels_array, nb_epochs, nb_patience):

    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D, GlobalMaxPooling2D, MaxPooling2D, AveragePooling2D, Activation, Dropout, Flatten, Dense
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.preprocessing import image

    import mlflow
    import mlflow.tensorflow

    # Download the ZIP archive
    from modules.loader import Loader

    loader = Loader(
        "https://stdatalake010.blob.core.windows.net/public/cifar-100.zip",
        '../datas/ZIP/',
        extraction_target='../datas/RAW/'
    )
    loader.ensure_data_loaded()

    # Extract the dataset
    from modules.splitting import Splitting

    labels_array = ['apple', 'bee']  # note: this overrides the labels_array argument

    TRAIN_DATA_DIR = Splitting.copie_dossiers(
        '../datas/RAW/train',
        labels_array,
        500,
        explorer=False
    )

    print(TRAIN_DATA_DIR)

    # Load the images
    image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(validation_split=0.2)
    TRAIN_IMAGE_SIZE = 32
    TRAIN_BATCH_SIZE = 64

    train_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='training')
    
    validation_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR, # same directory as training data
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='validation')
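
    # validation_split=0.2 splits each class directory deterministically, so
    # the 'training' and 'validation' subsets see disjoint files from the
    # same folder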

    with mlflow.start_run():

        model = Sequential()
        model.add(Conv2D(32, kernel_size=3, activation='elu', kernel_initializer='he_uniform', padding='same', input_shape=(32,32,3)))
        # Always at the end: flatten before the dense output layers
        model.add(Flatten())
        model.add(Dense(2, activation='softmax'))

        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        es_callback = EarlyStopping(monitor='val_loss', patience=nb_patience)
        training = model.fit(train_generator, epochs=nb_epochs, callbacks=[es_callback], validation_data=validation_generator, shuffle=False)
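
# Usage sketch (hypothetical values; note that train() currently overrides its
# labels_array argument with ['apple', 'bee']):
# train(['apple', 'bee'], nb_epochs=20, nb_patience=3)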
Example #17
def report(infile, outfile, displaycount):
    #initialize cleaner and load stopwords
    cleaner = TweetCleaner()
    stopwords = cleaner.load_stopwords(['stopwords/stopwords_en.txt', 'stopwords/stopwords_pt-br.txt'])

    #read file with loader module
    print('Reading file. This may take a while...')
    loader = Loader()
    items = loader.read_file(infile)
    print('File read successfully!\nProcessing the summary...')

    if 'text' not in items[0]:
        print("Warning: 'text' key is required.\nTerminating...")
        sys.exit(0)

    tweet_count = len(items)

    summary = "File name: " + infile + '\n'
    summary += "Tweet count: " + str(tweet_count) + "\n\n"

    if 'created_at' in items[0]:
        #created_at exists
        date_upper = items[0]['created_at']
        date_lower = items[tweet_count - 1]['created_at']

        summary += "Most recent tweet: " + date_upper + "\n"
        summary += "Oldest tweet: " + date_lower + "\n"
    elif 'date' in items[0]:
        date_upper = items[0]['date']
        date_lower = items[tweet_count - 1]['date']

        summary += "Most recent tweet: " + date_upper + "\n"
        summary += "Oldest tweet: " + date_lower + "\n"
    else:
        summary += "Warning: 'created_at' or 'date' key does not exist. Date range information cannot be fetched."

    username_key = get_username_key(items[0])

    if 'retweets' in items[0]:
        summary += '\nTop retweeted tweets:\n'
        cont = 0
        for tweet in sorted(items, reverse=True, key=lambda i: i['retweets']):
            if 'RT @' not in tweet['text'] and cont < displaycount:
                summary += format_print_tweet(tweet, username_key)
                cont += 1
            if cont >= displaycount:
                break


    word_list = []
    hashtag_list = []
    user_list = []

    for tweet in items:
        tweet['text'] = cleaner.standardize_quotes(tweet['text'])
        tweet['text'] = cleaner.clean_apostrophe_s(tweet['text'])
        tweet['text'] = cleaner.remove_urls(tweet['text'])
        tweet['text'] = cleaner.remove_symbols(tweet['text'])
        tweet['text'] = cleaner.remove_stopwords(tweet['text'], stopwords)
        tweet['text'] = cleaner.remove_emoji(tweet['text'])
        tweet['text'] = tweet['text'].lower()

    for tweet in items:
        #print(re.findall(r'#\w+', tweet['text']))
        hashtag_list += re.findall(r'#\w+', tweet['text'])
        user_list += re.findall(r'@\w+', tweet['text'])
        word_list += re.findall(r'\b\w+', tweet['text'])


    word_dict = {}
    hashtag_dict = {}
    user_dict = {}

    for hashtag in hashtag_list:
        if hashtag in hashtag_dict:
            hashtag_dict[hashtag] += 1
        else:
            hashtag_dict[hashtag] = 1

    for user in user_list:
        if user in user_dict:
            user_dict[user] += 1
        else:
            user_dict[user] = 1

    for word in word_list:
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1


    summary += '\n\nWord ranking:\n\n'
    count = 0
    for key, value in sorted(list(word_dict.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0])):
        if count < displaycount:
            summary += '\t%s: %s\n' % (key, value)
        count += 1

    summary += '\nUser ranking:\n\n'
    count = 0
    for key, value in sorted(list(user_dict.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0])):
        if count < displaycount:
            summary += '\t%s: %s\n' % (key, value)
        count += 1

    summary += '\nHashtag ranking:\n\n'
    count = 0
    for key, value in sorted(list(hashtag_dict.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0])):
        if count < displaycount:
            summary += '\t%s: %s\n' % (key, value)
        count += 1

    with open(outfile, 'w', encoding='utf8') as f:
        f.write(summary)

    print('Successfully wrote file to ' + outfile + '!')
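
# Usage sketch (hypothetical file names):
# report('tweets.json', 'tweets_summary.txt', displaycount=20)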
Example #18
# Check the MUL CAT vars and NL vars, find proper methods for processing and generate related constants
from modules.analyser import Analyzer
from modules.loader import Loader
from modules.saver import Saver
from modules.preprocessor import Processor
from utils.constants import VILLE_NAME, Armoire_PICK, Int_PICK, PL_PICK
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os

date_str = '0723'
analyzer = Analyzer(datestr=date_str)
loader = Loader(datestr=date_str)
saver = Saver(datestr=date_str)
processor = Processor(datestr=date_str)
"""
MUL CAT vars
PL: lampe_Type
INT: pan_Solde, int_Solde, int_ElemDefaut, int_TypeTnt, int_TypeEqt, pan_TypeEqt, pan_Defaut, int_Defaut

NL vars

"""
## lampe_Type: {}
## int_ElemDefaut: {'cover':['Crosse','Vasque','Enveloppe exterieure','Support','Coffret'],
## 'electricity':['Armorceur','Platine','Lampe','Câbles','Appareillage','Ballast','Protection électrique'],
## 'else':['NA','Luminaire','Armoire départ','Horloge','Alimentation générale']}
# This is the maximum number of samples that a single 'set' will contain.
MAX_SAMPLE_SET_SIZE = 5000

# Remote URLs
REMOTE_IMAGE_URL_FILE = "https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train-images" \
                        "-boxable.csv"
REMOTE_GROUND_TRUTH_FILE = "https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train" \
                           "-annotations-bbox.csv"

if __name__ == "__main__":

    # Load the project settings and required modules.
    Logger.log_special("Running Sample Creator", with_gap=True)
    settings = ProjectSettings("settings.yaml")
    loader: Loader = Loader()

    # Read in the source data, and create our own sample data.
    Logger.log_special("Begin Sample Initialization", with_gap=True)
    loader.check_and_load(settings.IMAGE_URL_FILE, REMOTE_IMAGE_URL_FILE)
    samples = loader.create_samples(settings.IMAGE_URL_FILE)

    # Now that we have sample IDs and URLs, we can associate them with the GT annotations.
    Logger.log_special("Begin Sample Association", with_gap=True)
    loader.check_and_load(settings.GROUND_TRUTH_FILE, REMOTE_GROUND_TRUTH_FILE)
    loader.associate_boxes_with_samples(samples, settings.GROUND_TRUTH_FILE)

    # Exporting the created samples.
    Logger.log_special("Begin Sample Export", with_gap=True)
    pather.create(settings.SAMPLES_DIRECTORY)
    loader.export_samples(samples, path=settings.SAMPLES_DIRECTORY, size=5000)
Example #20
body_labels = ['/m/04yx4', '/m/03bt1vf', '/m/01g317', '/m/05r655', '/m/01bl7v']

# face_labels = ['HUMAN FACE', 'HUMAN HEAD']
face_labels = ['/m/0dzct', '/m/04hgtk']

# car_labels = ['Land vehicle']
car_labels = ['/m/01prls']

if __name__ == "__main__":

    # Load the project settings and required modules.
    Logger.log_special("Running Sample Analysis", with_gap=True)
    settings = ProjectSettings("settings.yaml")

    # Load the class labels.
    loader = Loader()
    loader.load_labels(settings.LABELS_FILE)

    # Get ALL of the samples in the directory.
    samples = []
    sample_files = os.listdir(settings.SAMPLES_DIRECTORY)
    for i in sample_files[:20]:
        file_path = os.path.join(settings.SAMPLES_DIRECTORY, i)
        samples += Loader.load_sample_set_from_file(file_path)

    class_instances = {}
    class_appearances = {}

    for key in loader.label_map:
        class_instances[key] = 0
        class_appearances[key] = 0
Example #21
import json
import math
import numpy as np

from modules.loader import Loader
from modules.neural import Neural

# Configuration
with open('./config.json') as configfile:
    config = json.load(configfile)
data_dir = config["data_dir"]

neural_network_config = config["neural-network"]
weight_dir = neural_network_config["weight_dir"]

# Utility classes
loader = Loader(neural_network_config)


def to_python_list(array):
    # flatten to 1-D and convert to plain Python scalars, mapping NaN to None
    # (np.asscalar was removed in newer NumPy releases; .item() is equivalent)
    array = np.reshape(array, (-1))
    return [
        a.item() if not math.isnan(a.item()) else None
        for a in np.array(array)
    ]
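
# Example: to_python_list(np.array([[1.0, float('nan')], [2.5, 3.0]]))
# returns [1.0, None, 2.5, 3.0]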


class Predicter:
    def __init__(self, config):
        self.time_frame_size = config["time_frame_size"]

    def predict(self, dates):