def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    data_x = []
    data_y = []
    sentence = []
    categories = []
    # use a context manager so the file handle is always closed
    with open(file, "r") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # blank line marks the end of a sentence: pad it to length 20
                sentence, categories = add_padding(sentence, categories, 20)
                data_x.append(sentence)
                data_y.append(categories)
                sentence = []
                categories = []
            else:
                word = parts[0].lower()
                if word in mapping:
                    sentence.append(mapping[word])
                else:
                    # out-of-vocabulary word: fall back to a zero embedding
                    sentence.append(np.zeros(50))
                categories.append(one_hot_encoding(parts[2]))
    data_x = np.array(data_x)
    data_y = np.array(data_y)

    return data_x, data_y
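The loader above (and the similar one in Example #5) depends on three helpers defined elsewhere in its project. A minimal sketch of plausible implementations, assuming the standard GloVe text format, a placeholder tag set, and zero-vector padding to a fixed length; the real helpers may differ (Example #5, for instance, calls a one-list variant of add_padding):

import numpy as np

def load_embeddings(path, dim=50):
    # GloVe text format: one token per line, the word followed by `dim` floats
    mapping = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            mapping[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return mapping

def one_hot_encoding(label, labels=("O", "B-ENT", "I-ENT")):
    # `labels` is a placeholder tag set; the real set depends on the dataset
    vector = np.zeros(len(labels))
    vector[labels.index(label)] = 1.0
    return vector

def add_padding(sentence, categories, max_len, dim=50, n_labels=3):
    # truncate or zero-pad both the embedding list and the label list to max_len
    sentence = sentence[:max_len] + [np.zeros(dim)] * (max_len - len(sentence))
    categories = categories[:max_len] + [np.zeros(n_labels)] * (max_len - len(categories))
    return sentence, categories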
Example #2
def main(args):
    pp = Preprocess()

    # ---- Run Spotlight ----
    # graphs = pp.parse_tcp_dump(args.tcp_folder, args.csv_file)

    # tcp = ad.read_csv_file(args.csv_file)
    # graphs = tcp.iloc[:, [0, 1, 3]]
    # graphs.columns = ['source', 'destination',  'hours_past']
    #
    # run_spotlight(args, np.array(graphs))

    # # ---- Run Shingle Sketch -----
    # graph_utils.create_graphs(args.csv_file, args.graph_folder)
    is_gexf = False
    graphs = pp.preprocess_gfiles(args.graph_folder)

    # #--- For Muta or Chemical Data ----
    # graphs = pp.preprocess_gexf(args.gexffile)
    # is_gexf = True
    #
    # #---For DOS Attack Data ---
    # graphs = pp.preprocess_single_gfile("data/dos.g")

    run_shingle(args, graphs, is_gexf)

    ad = AnomalyDetection()
    skvector = ad.read_sketch(args.sketch_vector)
    print(skvector.shape)
    ad.anomaly_detection(skvector, args)
Example #3
def algorithm(df, params):
    """
    wrapper function that wraps each individual algorithm
    :param df: dataframe that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """

    output = {}

    # algorithm specific code
    # construct sentiment analysis
    PP = Preprocess(df, params['column'])

    output['phrases'] = PP.get_phrases()
    output['filtered'] = filtered_tokens = PP.get_words()
    output['processed'] = processed_tokens = PP.stem_lematize(
        params['process'], filtered_tokens)
    output['tagged'] = PP.tagging(params['tagger'], processed_tokens)
    filtered_most_common, processed_most_common = PP.most_frequent(
        filtered_tokens, processed_tokens)
    output['most_common'] = processed_most_common

    # plot
    index = []
    counts = []
    for common in processed_most_common[1:51]:
        index.append(common[0])
        counts.append(common[1])
    title = 'Top 50 frequent words (' + params['process'] + ')'
    output['div'] = plot.plot_bar_chart(index, counts, title)

    return output
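A sketch of how this wrapper might be invoked; the CSV path, column name and parameter values below are assumptions rather than values taken from the original project:

import pandas as pd

df = pd.read_csv("reviews.csv")       # hypothetical input file
params = {
    "column": "text",                 # column holding the raw documents (assumed name)
    "process": "lemmatize",           # passed to stem_lematize and echoed in the plot title
    "tagger": "nltk",                 # tagger backend (assumed value)
}
results = algorithm(df, params)
print(results["most_common"][:10])    # first ten entries of the processed-token frequency list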
Example #4
    def __init__(self, image=False, images=None, GTPath=""):
        if images:
            try:
                self.homogenized = numpy.array(Image.open(images[0]))
                self.vesselEnhanced = numpy.array(Image.open(images[1]))
                self.images = images
            except IndexError:
                print("""`images` parameter must include the homogenized image 
                    at `images[0]` and vessel enhanced image at `images[1]`""")
                raise
        else:
            self.preprocess          = Preprocess(image)
            self.homogenized    = self.preprocess.process(enhance=False).image_array
            self.vesselEnhanced = self.preprocess.process(onlyEnhance=True).image_array
            self.mask           = self.preprocess.mask
            self.source         = image
            self.image          = Image.open(image)
            self.loaded         = self.image.load()
        if len(GTPath):
            self.gt             = True
            self.groundtruth    = Image.open(GTPath)
        else:
            self.gt             = False

        self.feature_array      = numpy.empty(0)
Example #5
def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    data = pd.read_csv(file,
                       encoding='latin-1',
                       names=['sentiment', 'id', 'date', 'q', 'nick', 'tweet'])
    data = data.sample(frac=1)
    data = data[:100000]

    data_x = []
    data_y = []
    for index in data.index:
        row = data.loc[index, :]
        if row['sentiment'] != 2:
            row['tweet'] = preprocess.preprocess(row['tweet'])
            tweet = []
            for word in row['tweet'].split():
                if word in mapping:
                    word_embedding = mapping[word]
                    tweet.append(word_embedding)
                else:
                    tweet.append(np.zeros(50))
            tweet = add_padding(tweet, 20)
            data_x.append(tweet)
            data_y.append(one_hot_encoding(row['sentiment']))
    data_x = np.array(data_x)
    data_y = np.array(data_y)

    return data_x, data_y
Example #6
    def __init__(self):

        classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"

        # scenario 1
        # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
        # scenario 2
        # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
        # scenario 3
        # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
        ner_jar_path = "stanford/stanford-ner.jar"

        # raise the Java heap size to avoid memory errors inside nltk internals
        nltk.internals.config_java(options='-Xmx5g')

        self.pre = Preprocess()
        self.scp = StanfordParser(
            './stanford/stanford-parser.jar',
            './stanford/stanford-parser-3.9.1-models.jar',
            encoding='utf8')
        self.ner_tagger = StanfordNERTagger(classifier_path1,
                                            ner_jar_path,
                                            encoding='utf8')  # for scenario 3
        self.pos_tagger = StanfordPOSTagger(
            './stanford/english-bidirectional-distsim.tagger',
            './stanford/stanford-postagger.jar',
            encoding='utf8')
        # combining classifier from Stanford with custom classifier
        # self.com_tagger = NERComboTagger(classifier_path1,ner_jar_path,stanford_ner_models=classifier_path1+","+classifier_path2) #for scenario 1 and 2
        self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
Example #7
   def parse(self, response):
      ''' This method is called repeatedly to process documents from the URL frontier.

      Scrapy handles compliance with politeness policies.
      '''

      url = response.request.url

      # Remove html tags from the document
      raw_text = GetText(response.body)

      # Preprocess the document's content
      tokens = Preprocess(raw_text)

      # Add document to be stored in local storage
      if self.count < LIMIT:
         self.dstore.add_document(tokens, response.body, url)

      # Extract url references and add them to the url frontier
      for a in response.css('a'):
         if 'href' in a.attrib:
            yield response.follow(a, callback=self.parse)

      # Limit of pages to crawl
      if self.count > LIMIT:
         raise CloseSpider(reason='reached_limit')    # Force spider to close

      print(str(self.count) + '\n\n')     # IGNORE/COMMENT THIS
      
      self.count += 1
Example #8
    def __init__(self, feature_list, **kwargs):
        """create a policy object that preprocesses according to feature_list and uses
		a neural network specified by keyword arguments (see create_network())
		"""
        self.preprocessor = Preprocess(feature_list)
        kwargs["input_dim"] = self.preprocessor.output_dim
        self.model = CNNPolicy.create_network(**kwargs)
        self.forward = self._model_forward()
Example #9
    def test_feature_match(self, tresh, retest, img1, imgref):
            
        print("Test_feature_match")  
        
        process = Preprocess("NA","NA")
        fonte = cv2.FONT_HERSHEY_SIMPLEX 
        
        try:
            x_detect,y_detect,score=process.feature_match(img1, imgref)   
            print(str("Score of Feature Match") + str(score))
        except Exception:
            score = 0
            print("exception here")
        finally:         
            url,CustomerName,Division,SerialNumber,AssemblyNumber,TesterName,ProcessStep,Operator = get_data_to_test()
            print("Teste de Serial:" + str(SerialNumber))        
            now = datetime.now()
            dt_string = now.strftime("%d_%m_%Y_%H%M%S")          
            if SerialNumber=="":
                SerialNumber=str("No_Serial" + str(dt_string))
                print(SerialNumber)
  
            #----- Serial Number insertion ---------

            self.Set_Serial_TestTime_List(SerialNumber)       
            
            #RESULT OF TEST 
            if(score>int(tresh)):
                cv2.putText(img1, "PASS - LABEL DETECTED", (50, 400), fonte, 3, (0,255,0), 3, cv2.LINE_AA)           
                cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125,255,255), 1, cv2.LINE_AA)
                send_test_result("P")
                #cv2.imwrite("./logs/" + str(SerialNumber)+ "_pass.jpg",img1)    
            
            elif(score<int(tresh)) and (score>=0):
                cv2.putText(img1, "FAIL- NO LABEL", (50, 400), fonte, 3, (0,0,255), 3, cv2.LINE_AA)
                cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125,255,255), 1, cv2.LINE_AA)
                
                #send_test_result("F")

                print("retest numbers:")
                print(str(self.Count_Serial_TestTime_Occurence(SerialNumber)))
                print(str(self.Get_Retest_Times_Before_Fail()))
                
                
                if (self.Count_Serial_TestTime_Occurence(SerialNumber) > self.Get_Retest_Times_Before_Fail()):
                    send_test_result("F")
                    #send_test_result_parser(ResultMes="F",Fail_Description=str("FAIL FIRMWARE VERSION "+ str(string)))
                    cv2.putText(img1, "MES REJECTION"+ str(self.Count_Serial_TestTime_Occurence(SerialNumber)), (50, 680), fonte, 1.5, (0,0,255), 2, cv2.LINE_AA)
                else:
                    cv2.putText(img1, "RETEST NUMBER:"+ str(self.Count_Serial_TestTime_Occurence(SerialNumber)), (50, 680), fonte, 1.5, (0,0,255), 2, cv2.LINE_AA)

                #cv2.imwrite("./logs/" + str(SerialNumber) +"_fail.jpg",img1)  

            #cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125,255,255), 1, cv2.LINE_AA)
        
        return score
Example #10
def main(argv):
    pre = Preprocess(argv[0], argv[1])
    pre.build_vectors()
    dataset = ToxicityDataset(pre.vectors, pre.targets)
    # Without sentiment
    # gru = GRU(360).double()
    # With sentiment
    gru = GRU(373).double()
    if use_GPU:
        gru.cuda()
    training.train(gru, dataset, 2, 4, 0.1, use_gpu=use_GPU)
Example #11
 def preprocess(self, new, vector, chi2):
     self.prepro = Preprocess(self.data_set, new)
     if new == 'True':
         if vector == 'hashing':
             self.prepro.hashVector()
         if vector == 'tfidf':
             self.prepro.tfidfVector()
         # print self.preprocess.y_train
     else:
         self.prepro.vectorize(vector)
     if chi2:
         self.prepro.chisquare()
Example #12
def get_data():
    files = os.listdir('./MealNoMealData')
    meal_data_files = []
    no_meal_data_files = []
    for file in files:
        if 'Nomeal' in file:
            no_meal_data_files.append(os.path.join('./MealNoMealData', file))
        else:
            meal_data_files.append(os.path.join('./MealNoMealData', file))

    data = []

    labels = []
    for meal_data_file, no_meal_data_file in zip(meal_data_files,
                                                 no_meal_data_files):

        preprocess_obj = Preprocess(meal_data_file)
        meal_df = preprocess_obj.get_dataframe()
        meal_features = Features(meal_df)
        meal_features.compute_features()
        # temp_meal_features = meal_features.pca_decomposition().tolist()
        temp_meal_features = meal_features.get_features()
        labels += [1] * len(temp_meal_features)

        preprocess_obj_ = Preprocess(no_meal_data_file)
        no_meal_df = preprocess_obj_.get_dataframe()
        no_meal_features = Features(no_meal_df)
        no_meal_features.compute_features()
        no_meal_features_ = no_meal_features.get_features()
        # no_meal_final_features = meal_features.pca.transform(no_meal_features_).tolist()
        no_meal_final_features = no_meal_features_
        labels += [0] * len(no_meal_features_)

        for no_meal_feature in no_meal_final_features:
            temp_meal_features.append(no_meal_feature)

        for meal_no_meal_feature in temp_meal_features:
            data.append(meal_no_meal_feature)

    return data, labels
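One plausible way to consume get_data(); the split ratio and classifier choice are assumptions for illustration (the project's own Classification wrapper appears in other examples):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data, labels = get_data()
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
clf = RandomForestClassifier().fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))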
Example #13
 def define_model(self, name):
     if not self.is_trained:
         if name == 'preprocInc':
             #self.mod = MultinomialNB()
             self.mod = Pipeline([('what', Preprocess()),
                                  ('a pain',
                                   MultinomialNB(alpha=0.05,
                                                 fit_prior=False,
                                                 class_prior=None))])
         else:
             print(
                 'Error selecting the model, falling back to MultinomialNB by default')
             self.mod = MultinomialNB()
     else:
         print("Model already loaded")
Example #14
    def queryProcess(self):
        preprocess = Preprocess()
        self.query = self.query.lower()
        self.query = preprocess.preprocess(self.query)

        tokenizer = RegexpTokenizer(r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w-]+|'.*'")
        self.query_tokens = tokenizer.tokenize(self.query)

        if self.query_tokens[0] in wh_qstn_words:
            self.query_type = 1
        elif self.query_tokens[0] in ab_qstn_words:
            self.query_type = 2
        elif self.query_tokens[0] in desc_qstn_words:
            self.query_type = 3
        else:
            self.query_type = 4
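The first-token check above relies on three word lists defined elsewhere in the project; placeholder definitions (the actual lists are assumptions) might look like:

wh_qstn_words = ["what", "who", "whom", "where", "when", "which", "why", "how"]
ab_qstn_words = ["is", "are", "was", "were", "do", "does", "can", "could", "will", "would"]
desc_qstn_words = ["define", "describe", "explain", "list"]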
Example #15
def lambda_handler(event, context):
    # TODO implement

    json_data = json.loads(event['body'])
    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)

    pose_objects = preprocess.new_pose_objects

    features = []

    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    # pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    s3 = boto3.resource('s3')

    svm_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("SVM_model.pkl").get()
        ['Body'].read())

    logreg_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LogReg_model.pkl").get()
        ['Body'].read())

    lda_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LDA_model.pkl").get()
        ['Body'].read())

    random_forest_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("RForest_model.pkl").get()
        ['Body'].read())

    prediction_rf = random_forest_classifier.predict(features)
    prediction_svm = svm_classifier.predict(features)
    prediction_lda = lda_classifier.predict(features)
    prediction_logreg = logreg_classifier.predict(features)

    data = {
        "1": prediction_svm[0],
        "2": prediction_logreg[0],
        "3": prediction_lda[0],
        "4": prediction_rf[0]
    }
    return {'statusCode': 200, 'body': json.dumps(data)}
Example #16
import torch
from preprocessing import Preprocess
import json
from dataset import QADataset
from transformers import BertModel
import os
from model import Answer
from solver import Solver
import sys

arg = sys.argv
ctx_max_len = 475
question_max = 30
pre = Preprocess(ctx_max_len=ctx_max_len, question_max=question_max)
data = {}
if not os.path.isdir('processed_data'):
    os.mkdir('processed_data')
if not os.path.isdir('ckpt'):
    os.mkdir('ckpt')
#
if arg[1] == 'train':
    for name in ['dev', 'train']:

        if not os.path.isfile(f'processed_data/{name}.pkl'):
            print(f"Start {name}......")
            with open(f"data/{name}.json") as f:
                file = json.load(f)
                file = [data for data in file['data']]

            pre_data = pre.preprocess_data(file,
                                           train=not (name == 'test'),
Example #17
from preprocessing import Preprocess
from activity import Activity
from threshold import Threshold
import pandas as pd
import os

# Read raw data from file
raw_data_frame = Preprocess("raw_data/girlbosskaty_tweets.csv", header=0)

# Select the Time and username column from raw data
data_time_uid = raw_data_frame.get_columns(["Screen_Name", "Time"])

# print(data_time_uid)

# Calculate Activity
act = Activity(data_time_uid)
dic_act = act.export_times()

# print(dic_act)

myThresh = Threshold(dic_act)
# print(myThresh.apply_clock_threshold(start_time="01:00:00", stop_time="05:00:00"),
#       "tweets between %s and %s" % (myThresh.start_time, myThresh.stop_time))
# print(myThresh.ckeck_day_tweets())

WeekDay_counter = myThresh.ckeck_day_tweets()
night_tweet_counter = myThresh.apply_clock_threshold(start_time="01:00:00",
                                                     stop_time="05:00:00")

print(WeekDay_counter)
Example #18
#
# print(len(X), len(Y))
# clf_rforest = Classification('RForest', X, Y)
# clf_rforest.get_classifier_object()
# clf_rforest.get_metrics()
# pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
# print()
files = os.listdir('./data/gift')
for file in files:
    file_path = os.path.join('./data/gift', file)
    with open(file_path, encoding="utf-8") as data:
        json_data = json.load(data)

    # print(json_data)

    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)

    pose_objects = preprocess.new_pose_objects

    features = []

    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    random_forest_classifier = pickle.load(open('RForest_model.pkl', 'rb'))

    # prediction = random_forest_classifier.predict(reduced_feature_matrix)
Example #19
from preprocessing import Preprocess

if __name__ == '__main__':

    #os.system('sudo raspivid -br 80')
    cam = Camera(1280, 1080, dispositivo=1, camera_type='WEBCAM')
    cam.set_focus(25)
    cam.set_exposure(100)
    cam.set_exposure_auto(3)

    # Initialize Testplan
    testplan = Testplan(produto='solo', posto=1)
    imReference = testplan.get_imgRef()

    # Initialize the preprocessing model
    preprocess = Preprocess(produto='solo', posto=1)

    while True:

        ret, frame1 = cam.camera_read()
        frame1 = cv2.resize(frame1, (640, 480), interpolation=cv2.INTER_CUBIC)

        preprocess.executa_preprocessamento(imgFrame=frame1,
                                            imgRef=imReference)
        #preprocess.segmentation(frame1)
        imReg, frame2, Result = preprocess.custom_processing(
            imReference, frame1)

        if Result:

            testplan.executa_teste(imReg)
Example #20
    def train(self, train_data, time_info):
        self.new_model_history_label = []
        self.lgb_predict_list = []
        self.linear_predict_list = []
        self.new_model_n_predict = 0

        self.data = train_data
        gc.collect()

        self.data['changed_y'] = self.data[self.label].copy()
        self.preprocess = Preprocess()
        self.preprocess.train_preprocess(self)

        if self.n_predict == 0:
            tt, interval, na_num = time_interval(
                self.data[self.primary_timestamp])
            with time_limit("featParamsad"):
                self.featParamsad = FeatParams(copy.deepcopy(self), tt,
                                               interval, na_num)
                self.featParamsad.fit_transform()

        gc.collect()

        self.feat_engine = Feat_engine(self.featParamsad)
        self.feat_engine.same_feat_train(self)
        self.feat_engine.history_feat_train(self)

        if self.use_sample_weight:
            TransExponentialDecay(self.primary_timestamp,
                                  init=1.0,
                                  finish=0.75,
                                  offset=0).fit(train_data)

        gc.collect()

        col = self.data.any()
        col = col[col].index
        self.data = self.data[col]
        gc.collect()

        X = self.data

        categorical_feature = []
        self.last_drop_col.append(self.primary_timestamp)

        if self.n_predict == 0:
            y = self.data.pop(self.label)
            y1 = self.data['changed_y']
            X_train, y_train, X_eval, y_eval = time_train_test_split(
                X, y, self.primary_timestamp, shuffle=False)
            if self.time_seg:
                seg_num = len(X_train) // self.time_seg
                X_train['time_seg'] = [
                    (((i // seg_num) + 1) if
                     ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                    for i in range(len(X_train))
                ]
                X_eval['time_seg'] = self.time_seg

            self.lgb_model.param_opt_new(X_train, y_train, X_eval, y_eval,
                                         categorical_feature, self.primary_id,
                                         self.primary_agg,
                                         self.primary_timestamp)
            X_train.drop(self.last_drop_col, axis=1, inplace=True)

            _, sc1 = self.lgb_model.valid_fit(X_train,
                                              y_train,
                                              X_eval,
                                              y_eval,
                                              categorical_feature,
                                              self.use_sample_weight,
                                              round=100)
            if (y != y1).any():
                y_train = y1[:len(y_train)]
                mod1 = self.lgb_model.model
                self.lgb_model.model = None
                _, sc2 = self.lgb_model.valid_fit(X_train,
                                                  y_train,
                                                  X_eval,
                                                  y_eval,
                                                  categorical_feature,
                                                  self.use_sample_weight,
                                                  round=100)
                if sc2 < sc1:
                    gc.collect()
                    self.use_exp_y = False
                    y = y1
                else:
                    y_train = y[:len(y_train)]
                    self.lgb_model.model = mod1
            lgb_preds, _ = self.lgb_model.valid_fit(X_train, y_train, X_eval,
                                                    y_eval,
                                                    categorical_feature,
                                                    self.use_sample_weight)

            col = X_train.any()
            col = col[col].index
            X_train = X_train[col]
            X_eval = X_eval[col]
            gc.collect()
            linear_preds = self.linear_model.valid_fit(X_train, y_train,
                                                       X_eval, y_eval,
                                                       self.use_sample_weight)
            gc.collect()
            if self.tmpControlType == 1:
                self.linear_weight, self.lgb_weight = 1, 0
            elif self.tmpControlType == 2:
                self.linear_weight, self.lgb_weight = 0, 1
            else:
                self.linear_weight, self.lgb_weight = serch_best_fusion_proportion(
                    linear_preds, lgb_preds, y_eval)
        else:
            if not self.use_exp_y:
                self.data[self.label] = self.data['changed_y'].copy()
            y = self.data.pop(self.label)
            self.data.pop('changed_y')

        X.drop(self.last_drop_col, axis=1, inplace=True)

        if self.time_seg:
            seg_num = len(X) // self.time_seg
            X['time_seg'] = [
                (((i // seg_num) + 1) if
                 ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                for i in range(len(X))
            ]

        with time_limit("linear_fit"):
            self.linear_model.fit(X, y, self.use_sample_weight)

        with time_limit("fit"):
            self.lgb_model.fit(X, y, categorical_feature,
                               self.use_sample_weight)
        next_step = 'predict'
        return next_step
Example #21
def preprocess(args):
    prep = Preprocess(path_to_links=args.path_to_links,
                      path_to_movies=args.path_to_movies)
    prep.preprocess(path_show_data=args.path_to_overall,
                    path_alg_data=args.path_to_alg)
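A minimal argparse sketch that would satisfy this entry point; the flag names simply mirror the attributes read above and are otherwise assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--path_to_links", required=True)
parser.add_argument("--path_to_movies", required=True)
parser.add_argument("--path_to_overall", required=True)
parser.add_argument("--path_to_alg", required=True)
preprocess(parser.parse_args())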
Example #22
#%%
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
directory = '0105_clothing_ncf_it6_lr1e05'

trainingEpoch = 20

trainOption = True
validationOption = not True
testOption = True

# %%
"""
Setup for preprocessing
"""
pre_work = Preprocess()
num_of_reviews = 5
batch_size = 16
num_of_rating = 1
num_of_validate = 3

# %%
selectTable = 'clothing_'
res, itemObj, userObj = pre_work.loadData(
    havingCount=6,
    sqlfile="HNAE/SQL/[email protected]",
    LIMIT=1000,
    testing=False,
    table='clothing_')  # for clothing.
# res, itemObj, userObj = loadData(havingCount=20, LIMIT=2000, testing=False, table='elec_')  # for elec.
# res, itemObj, userObj = pre_work.loadData(havingCount=15, LIMIT=1000, testing=False, table='toys_')  # for toys
Example #23
# coding:utf-8
"""
building the AHTM models.
"""
import nltk
import codecs
import sys
import pandas
import numpy
import gensim
from preprocessing import Preprocess
from gensim import corpora, models, similarities
import pyLDAvis
p = Preprocess()
print "\n"
pm = p.multi_process()
train_set = pm[0]
# [ [],
#   [],...
#     ]
before_word_num = pm[1]
after_word_num = pm[2]
# print before_word_num
# print after_word_num

for i in train_set:
    for j in i:
        if len(j) <= 1:
            print "有长度为1的存在哦。"

dic = corpora.Dictionary(train_set)
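The example is cut off after building the dictionary; a typical gensim continuation (a sketch, not the project's own code) converts each token list to bag-of-words and fits an LDA model:

corpus = [dic.doc2bow(text) for text in train_set]   # bag-of-words per document
lda = models.LdaModel(corpus=corpus, id2word=dic, num_topics=10, passes=5)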
Example #24
import pickle
from preprocessing import Preprocess
from features import Features
import numpy as np
import pandas as pd


test_file_name = input("Please enter the test file name: ")
preprocess_obj = Preprocess(test_file_name)
test_file_dataframe = preprocess_obj.get_dataframe()
test_file_features_obj = Features(test_file_dataframe)
test_file_features_obj.compute_features()
test_file_features = test_file_features_obj.get_features()
# print(len(test_file_features))

# Random Forest
random_forest_clf = pickle.load(open('RForest_model.pkl', 'rb'))
y_pred = random_forest_clf.predict(test_file_features)
print('Saving the output of RandomForest classifier prediction')
rforest_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
rforest_dataframe.to_csv('RForest_output.csv')

# AdaBoost
adaboost_clf = pickle.load(open('Adaboost_model.pkl', 'rb'))
y_pred = adaboost_clf.predict(test_file_features)
print('Saving the output of AdaBoost classifier prediction')
adaboost_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
adaboost_dataframe.to_csv('Adaboost_output.csv')

# XGBoost
XGBoost_clf = pickle.load(open('XGBoost_model.pkl', 'rb'))
Example #25
 def __init__(self):
     self.pre = Preprocess()
     self.nlp = NLPHelper()
     self.fex = FeatureExtractor()
     self.ut = Utility()
     self.mt = ModelTrainer()
Example #26
from preprocessing import Preprocess
from classification import Classification
import pickle
from features import Features
import json
import os

preprocess = Preprocess()
preprocess.scale_points()

pose_objects = preprocess.new_pose_objects

features = []

features_obj = Features(pose_objects=pose_objects)
features_obj.compute_features()
# reduced_feature_matrix = features_obj.compute_pca()

# print(reduced_feature_matrix)
# print(len(reduced_feature_matrix),len(reduced_feature_matrix[0]))

# X = reduced_feature_matrix
X = features_obj.get_features()
Y = [obj.label for obj in pose_objects]

print(len(X), len(Y))
clf_rforest = Classification('RForest', X, Y)
clf_rforest.get_classifier_object()
clf_rforest.get_metrics()
pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
print()
Example #27
def main():
    
    opt = parse()
    model_path = "RESULT/"+ opt.save + "/model"
    vocab_path = "RESULT/" + opt.save + "/vocab"
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(vocab_path, exist_ok=True)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    device = torch.device("cuda:0")

    opt.log = "RESULT/" + opt.save + "/log"
    opt.save_model = model_path

    # write the settings to the log file
    with open(opt.log, "a") as f:
        f.write("-----setting-----\n")
        f.write("MAX ITERATION : %d \
                \nCHECK INTERVAL : %d \
                \nBATCH SIZE : %d \
                \nACCUMULATION STEPS : %d \
                \nWORD CUT : %d \
                \nD_MODEL : %d \
                \nN_LAYERS : %d \
                \nN_HEAD : %d \
                \nDROPOUT : %.1f \
                \nMODE : %s \
                \nSAVE_MODEL : %s \
                \nLOG_PATH : %s \
                \nGPU NAME: %s \
                \nGPU NUM %s \
                \nDATASET : \n%s\n%s\n%s\n%s\n%s\n%s" \
                    %(opt.max_steps, \
                    opt.check_interval, \
                    opt.batch_size, \
                    opt.accumulation_steps, \
                    opt.word_cut, \
                    opt.d_model, \
                    opt.n_layers, \
                    opt.n_head, \
                    opt.dropout, \
                    opt.mode, \
                    opt.save, \
                    opt.log, \
                    torch.cuda.get_device_name(), \
                    opt.gpu, \
                    opt.train_src, \
                    opt.train_trg, \
                    opt.valid_src, \
                    opt.valid_trg, \
                    opt.test_src, \
                    opt.test_trg))

    #gradient accumulation
    opt.batch_size = int(opt.batch_size/opt.accumulation_steps)
    opt.batch_max_token = int(opt.batch_max_token/opt.accumulation_steps)
    opt.check_interval = int(opt.check_interval * opt.accumulation_steps)
    opt.max_steps = int(opt.max_steps * opt.accumulation_steps)

    # preprocessing
    source_vocab_path = "RESULT/" + opt.save + "/vocab/source_vocab"
    target_vocab_path = "RESULT/" + opt.save + "/vocab/target_vocab"

    SRC = Preprocess()
    TRG = Preprocess()

    train_source, valid_source, test_source = \
        SRC.load(train=opt.train_src,
                valid=opt.valid_src, 
                test = opt.test_src, 
                mode=1, 
                vocab_file=source_vocab_path)
    
    train_target, valid_target, test_target = \
        TRG.load(train=opt.train_trg,
                valid=opt.valid_trg, 
                test = opt.test_trg, 
                mode=1, 
                vocab_file=target_vocab_path)

    #SrcDict = SRC.reverse_dict
    TrgDict = TRG.reverse_dict
    src_size = len(SRC.dict)
    trg_size = len(TRG.dict)
    pad_idx = SRC.dict["<pad>"]
    trg_sos_idx = TRG.dict["<sos>"]
    trg_eos_idx = TRG.dict["<eos>"]

    # create batch sampler based on the number of sentences
    train_batch_sampler = create_sentence_batch_sampler(train_source, train_target, opt.batch_size)
    valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target, opt.valid_batch_size)

    # create batch sampler based on the number of tokens
    #train_batch_sampler = create_token_batch_sampler(train_source, train_target, opt.batch_max_token)
    #valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target, opt.valid_batch_size)
    
    #create dataset and dataloader
    train_data_set = MyDataset(train_source, train_target)
    valid_data_set = MyDataset(valid_source, valid_target)
    valid_data_loader = DataLoader(valid_data_set, batch_sampler=valid_batch_sampler, collate_fn=valid_data_set.collater)
    test_data_set = MyDataset(test_source, test_target)
    test_data_loader = DataLoader(test_data_set, batch_size=1, collate_fn=test_data_set.collater, shuffle=False)

    #train
    if opt.mode == "full" or opt.mode == "train":
        model = Transformer(src_size, trg_size, opt.d_model, opt.n_layers, opt.n_head, opt.dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9)
        scheduler = LambdaLR(optimizer, lr_lambda=lr_schedule)
        model, optimizer = amp.initialize(model, optimizer, opt_level=opt.level)

        trainer = Trainer(
            model = model,
            optimizer = optimizer,
            train_data_set = train_data_set,
            train_batch_sampler = train_batch_sampler,
            valid_data_loader = valid_data_loader,
            lr_scheduler = scheduler,
            device = device,
            TrgDict = TrgDict,
            pad_idx = pad_idx
            )
            
        trainer.train(opt.epoch, opt)

    #test
    if opt.mode == "full" or opt.mode == "test":
        load_point = opt.max_steps//opt.check_interval
        model = average_model(load_point, opt, src_size, trg_size, device)

        torch.cuda.empty_cache()
        beam_size = 4
        max_seq_len = 410
        translator = Translator(
            model = model,
            test_data_loader = test_data_loader,
            TrgDict = TrgDict,
            device = device,
            beam_size =  beam_size,
            max_seq_len = max_seq_len,
            src_pad_idx = pad_idx,
            trg_pad_idx = pad_idx,
            trg_bos_idx = trg_sos_idx,
            trg_eos_idx = trg_eos_idx)
        
        translator.test(opt.save)
Example #28
# read the training and testing files into dataframes
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# converting the columns into lists with UTF-8 encoding
train_text = train_data['Review Text'].values.astype('U').tolist()
train_title = train_data['Review Title'].values.astype('U').tolist()
train_rating = train_data['Star Rating'].values.astype('U').tolist()
test_text = test_data['Review Text'].values.astype('U').tolist()

# converting the ratings into integers
for i in range(len(train_rating)):
    train_rating[i] = int(train_rating[i])

# instantiating the Preprocessor, Vectorizer, DatasetBalancer and Classifier
pre = Preprocess()
v = Vectors()
balance = DatasetBalance()

# preprocess the train data
x = []
for i in train_text:
    x.append(" ".join(pre.preprocessing(i)))

print("traing data ready")

print("now making count vectors")
# making count vectors
x_cv = v.count_vectors(x, train=True)

y = train_rating
Example #29
	rht = Removing HTML tags
	rurls = Removing URLs
	rn = Removing Numbers
	ntw = convert numbers to words
	sc = Spelling Correction
	ata = convert accented to ASCII code
	sto = short_to_original
	ec = Expanding Contractions
	ps = Stemming (Porter Stemming)
	l = Lemmatization
	re = Removing Emojis
	ret = Removing Emoticons
	ew = Convert Emojis to words
	etw = Convert Emoticons to words
	rp = Removing Punctuations
	rs = Removing Stopwords
	rfw = Removing Frequent Words
	rrw = Removing Rare Words
	rsc = Removing Single characters
	res = Removing Extra Spaces
"""
print(f"******** Before preprocessing technique ******* ")
for sent in sentences[:5]:
    print(sent)
preprocessing = Preprocess()

preprocessed_text = preprocessing.preprocessing(sentences, techniques)
print(f"******** After preprocessing ****************")
for sent in preprocessed_text[:5]:
    print(sent)
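A sketch of a techniques list built from the codes documented above; which combinations the project accepts is an assumption:

# strip HTML tags and URLs, drop punctuation and stopwords, then lemmatize
techniques = ["rht", "rurls", "rp", "rs", "l"]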
Example #30
def get_tweet_name_time(input_file):
    name_time = Preprocess(input_file,
                           header=0).get_columns(["Screen_Name", "Time"])
    return name_time