Example #1
	def classify_comment(self, comment, classifier_type='SVM', no_classes=5):
		sentiment = None
		preprocessor = Preprocessing()
		vectorizer = VectorQuantization()
		if classifier_type == 'SVM':
			preprocessed_comment = preprocessor.preprocessing(comment)
			comment_vector = vectorizer.morphosyntactic_vector(preprocessed_comment)
			sentiment = classifier_svm.classify(comment_vector)
			if no_classes == 5:
				return sentiment
			elif no_classes == 3:
				if sentiment == u'positivo' or sentiment == u'muy_positivo':
					return u'positivo'
				elif sentiment == u'negativo' or sentiment == u'muy_negativo':
					return u'negativo'
				else:
					return sentiment
		elif classifier_type == 'MNB':
			preprocessed_comment = preprocessor.preprocessing(comment)
			comment_vector = vectorizer.bigram_vector(preprocessed_comment)
			sentiment = classifier_mnb.classify(comment_vector)
			if no_classes == 5:
				return sentiment
			elif no_classes == 3:
				if sentiment == u'positivo' or sentiment == u'muy_positivo':
					return u'positivo'
				elif sentiment == u'negativo' or sentiment == u'muy_negativo':
					return u'negativo'
				else:
					return sentiment
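The two classifier branches above repeat the same 5-to-3 class collapse. A minimal sketch of a helper that factors it out (hypothetical, not part of the original class; it assumes the same Spanish labels returned by the classifiers):

def collapse_sentiment(sentiment, no_classes=5):
    # Map the 5-point scale down to 3 classes when requested; otherwise pass through.
    if no_classes == 3:
        if sentiment in (u'positivo', u'muy_positivo'):
            return u'positivo'
        if sentiment in (u'negativo', u'muy_negativo'):
            return u'negativo'
    return sentiment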
Example #2
	def preprocess_tweets(self):
		""" Process tweets according to mode and set arrays """
		processObject = Preprocessing(self.mode, self.tweets)
		processObject.preprocess_tweets()
		if ( "stem" in self.mode):
			self.stemmed_tweets_array = processObject.stemmed_tweets_array
		if ( "token" in self.mode):
			self.tokenized_tweets_array = processObject.tokenized_tweets_array
		if ( "pos" in self.mode): 
			self.pos_tweets_array = processObject.pos_tweets_array
		if ( "lemma" in self.mode):
			self.lemmatized_tweets_array = processObject.lemmatized_tweets_array
Example #3
 def preprocess_tweets(self, mode, tweets_dict, filename):
     """ Process tweets according to mode and set arrays """
     processObject = Preprocessing(mode, tweets_dict, filename)
     processObject.preprocess_tweets()
     if "stem" in mode:
         self.stemmed_tweets_array = processObject.stemmed_tweets_array
     if "token" in mode:
         self.tokenized_tweets_array = processObject.tokenized_tweets_array
     if "pos" in mode:
         self.pos_tweets_array = processObject.pos_tweets_array
     if "lemma" in mode:
         self.lemmatized_tweets_array = processObject.lemmatized_tweets_array
Example #4
def main(args, config):
    wDir = os.getcwd()
    #Instantiate the Preprocessing class
    window = Preprocessing(args.fasta_file, config['win_length'], config['win_step'])
    window.output_window()
    print >> sys.stderr, "Creating windows_sequence.fasta"
    
    #Instantiate the Similarity and Composition classes
    sim = Similarity(args.fasta_file, config['score_adj'],wDir)
    sim_matrix = sim.mcl_perform() 
    comp_results = Composition(config['kmer_len'])
    comp_matrix = comp_results.joined()
    #Join similarity and composition matrix for PCA
    join = pd.concat([comp_matrix, sim_matrix], axis= 1, join='inner')
    print >> sys.stderr, "Calculating similarity and composition matrix"
    
    #Instantiate the Reduction class
    pca = Reduction(join, config['pca_comp'])
    pca_data = pca.perform_pca()
    print >> sys.stderr, "Performing PCA"
    
    #Instantiate the Clustering class
    cluster = Clustering(pca_data)
    clust_obj = cluster.plot()
    print >> sys.stderr, "Performing clustering plot"
    
    #Instantiate the ClusterReport class
    report = ClusterReport(clust_obj)
    file_name, querySeq = report.output_queryseq()
    print >> sys.stderr, "Doing report of clusters"

    #Instantiate the Validate class
    valid = Validate(file_name, args.fasta_file,wDir)
    jfileComp, jfileMinus = valid.roundTwo()
    print >> sys.stderr, "Validation of results"
    
    #Instantiate the ParseJplace class
    parsing = ParseJplace(jfileComp, jfileMinus)
    corrMat = parsing.correlation()
    print >> sys.stderr, "Doing profiles"
    
    #Instantiate the Profiles class
    ttest = Profiles(corrMat, querySeq)
    bestWin = ttest.windowsAssigment()
    print >>sys.stderr, "Doing permutations"
    
    #Instantiate the StatsBinom class
    finalResult = StatsBinom(args.fasta_file, config['win_length'],bestWin)
    finalResult.binomial()
    
    cleaning(file_name)
    def classify(self, nl_query):
        #pos_tree = tagger.to_tree(nl_query)

        tagged_yield = tagger.tagged_labeled_yield(nl_query)
        pos_tree = []
        for i in tagged_yield:
            pos_tree.append(i['ValueAnnotation'])
        pos_tree = " ".join(pos_tree)

        _, labels, trees = Preprocessing.data()
        text_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 1))), ('tfidf', TfidfTransformer(use_idf=False)), ('clf', LinearSVC()) ])
        _ = text_clf.fit(trees, labels)
        predicted = text_clf.predict([pos_tree])[0]
        predicted = Preprocessing.query(predicted)
        return predicted
class Frequencies():
    """
    Compute term frequencies (Top 100).
    """
    def __init__(self, path, limit):
        self.path = path
        self.limit = limit
        
        self.preprocessing = Preprocessing()
    
    def countFreq(self):
        t0 = time()
        data = self.preprocessing.read_with_numpy(self.path, self.limit)
        print("Number of loaded Tweets: " + str(len(data)) + " - loaded and preprocessed in %0.3fs" % (time()-t0))
        print()
        
        print("Count frequencies", end=": ")
        t0 = time()
        # Use a dictionary to keep track of the frequency of tokens.
        token_counter = {}
          
        # Get a line/tweet from the corpus.
        for line in data:
 
            # Split the text on whitespace to get a list of words.
            for word in line.split():
 
                # Counting with a dictionary the EAFP way.
                # Try to add one to value of a key in the dictionary.
                try: 
                    token_counter[word] += 1
 
                # If the dictionary raises a KeyError,
                # add the key to the dictionary and set its value to 1.
                except KeyError:
                    token_counter[word] = 1
 
        # Sort the dictionary by frequency.
        frequency_list = sorted(token_counter.items(), key=lambda x: x[1],
                reverse=True)
     
        # Print the 100 most frequent tokens.
#         print("Most frequent tokens (top 100):")
#         index = 1
#         for pair in frequency_list[:100]:
#             print(str(index).ljust(2), pair[0].ljust(20), str(pair[1]))
#             index += 1
     
        # Print the least frequent tokens (those with a frequency below 11).
        print("\nLeast common tokens (top 100):")
        index = 1
        for pair in frequency_list[-10000:]:
            if pair[1] < 11:
                print(str(index).ljust(2), pair[0].ljust(20), str(pair[1]))
                index += 1
        
        print("done in %0.3fs" % (time()-t0))
        
def generalize_sql(query_list):
    skeletons = []

    for query in query_list:

        skeleton = Preprocessing.to_skeleton(query)
        skeletons.append(skeleton)

    return skeletons
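The loop above reduces to a list comprehension; an equivalent sketch (same Preprocessing.to_skeleton call, no behaviour change intended):

def generalize_sql(query_list):
    # Comprehension form of the loop above; one skeleton per query.
    return [Preprocessing.to_skeleton(query) for query in query_list]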
Example #8
    def getPreprocessed(self):
        preprocessing = Preprocessing()
        # postprocessing = Postprocessing()

        frame = self.cam.get_frame()
        pre_options = preprocessing.options
        # Apply preprocessing methods toggled in the UI
        preprocessed = preprocessing.run(frame, pre_options)

        height, width, channels = frame.shape

        # model_positions, regular_positions = self.vision.locate(frame)
        # model_positions = postprocessing.analyze(model_positions)

        #	print model_positions
        # frame_h = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
        # print frame[(464-128), 111]

        return preprocessed
 def test(self):
     _, labels, trees = Preprocessing.data()
     test_labels = labels[2::3]
     test_trees = trees[2::3]
     labels = labels[0::3] + labels[1::3]
     trees = trees[0::3] + trees[1::3]
     text_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(2, 3))), ('tfidf', TfidfTransformer(use_idf=False)), ('clf', MultinomialNB()) ])
     # text_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 1))), ('tfidf', TfidfTransformer(use_idf=False)), ('clf', LinearSVC()) ])
     _ = text_clf.fit(trees, labels)
     predicted = text_clf.predict(test_trees)
     accuracy = np.mean(predicted == test_labels)
     return accuracy
Example #10
	def create(self, corpus_path, model_type="morphosyntactic"):
		preprocessor = Preprocessing()
		vectorizer = VectorQuantization()
		document_list = []
		with codecs.open(corpus_path, 'r', 'utf-8') as corpus:
			line = corpus.readline()
			while line:
				comment = preprocessor.preprocessing(line.split('/|/')[1])
				category = line.split('/|/')[2].split('\n')[0]
				if model_type == "morphosyntactic":
					comment_vector = vectorizer.morphosyntactic_vector(comment)
				elif model_type == "bigram":
					comment_vector = vectorizer.bigram_vector(comment)
				else:
					print "No model defined using default: morphosyntactic"
					comment_vector = vectorizer.morphosyntactic_vector(comment)
				if comment_vector:
					document_list.append(pattern_Document(comment_vector, 
														  type=category))
				line = corpus.readline()
		model = pattern_Model(documents=document_list, weight=None)
		return model
    def preprocess(self):
        """ Preprocess the suspicious and source document. """
        susp_fp = codecs.open(self.susp, 'r', 'utf-8')
        self.susp_text = susp_fp.read()
        self.susp_bow = Preprocessing.tokenize(self.susp_text, self.susp_offsets, self.susp_sents)
        Preprocessing.ss_treat(self.susp_bow, self.susp_offsets, self.min_sentlen, self.rssent)
        susp_fp.close()

        src_fp = codecs.open(self.src, 'r', 'utf-8')
        self.src_text = src_fp.read()
        self.src_bow = Preprocessing.tokenize(self.src_text, self.src_offsets, self.src_sents)
        Preprocessing.ss_treat(self.src_bow, self.src_offsets, self.min_sentlen, self.rssent)
        src_fp.close()
Example #12
	def dumps_scale_test(self):
		instance=Preprocessing(self.config)
		instance.output_dumps_scale()
		pass
Example #13
def one_map(df, y_col):
    cols = df.columns
    # # ['bkg_volumes_original', 'bkg_volumes_after', 'bkg_surfaces',
    # 'bkg_size_z', 'bkg_size_x', 'bkg_size_y', 'bkg_max_value',
    # 'bkg_sum_value', 'bkg_var_value', 'a_volumes', 'a_max_value',
    # 'a_sum_value', 'a_var_value', 'b_volumes', 'b_max_value', 'b_sum_value',
    # 'b_var_value', 'c_volumes', 'c_max_value', 'c_sum_value',
    # 'c_var_value']

    bkg_props = [[
        'bkg_volumes_after', 'bkg_surfaces', 'bkg_size_z', 'bkg_size_x',
        'bkg_size_y', 'bkg_max_value', 'bkg_sum_value', 'bkg_var_value'
    ]]
    bkg_prop_combs = list(combinations(bkg_props, 2))
    print(bkg_prop_combs)

    all_X_cols = [
        ['bkg_volumes_after', 'bkg_surfaces'],
        ['bkg_size_z', 'bkg_size_x', 'bkg_size_y'],
        ['bkg_max_value', 'bkg_sum_value', 'bkg_var_value'],
        [
            'bkg_volumes_after', 'bkg_surfaces', 'bkg_size_z', 'bkg_size_x',
            'bkg_size_y'
        ],
        [
            'bkg_size_z', 'bkg_size_x', 'bkg_size_y', 'bkg_max_value',
            'bkg_sum_value', 'bkg_var_value'
        ],
        [
            'bkg_volumes_after', 'bkg_surfaces', 'bkg_size_z', 'bkg_size_x',
            'bkg_size_y', 'bkg_max_value', 'bkg_sum_value', 'bkg_var_value'
        ],
    ]

    all_X_cols += bkg_props + bkg_prop_combs

    df = df[df[y_col] != "All nan"]  # copy.copy()
    print(df[df[y_col] == "All nan"])

    filter_particles = [
        k for k in df.index
        if "particle_-1" not in k and "particle_1617" not in k
    ]
    # pd.to_numeric(df[y_col], errors='coerce').isnull().index

    # #
    # abc_props = ["bkg_var_value", "a_volumes", "a_max_value", "a_sum_value"
    # 	"a_var_value", "b_volumes", "b_max_value", "b_sum_value",
    # 	"b_var_value", "c_volumes", "c_max_value", "c_sum_value",
    # 	"c_var_value"]
    # filter_particles = df.eq(df.loc[:, 0], axis=0).all(1)
    for X_cols in all_X_cols:
        X = np.array(df.loc[filter_particles, X_cols].values)
        y = np.array(df.loc[filter_particles, y_col].values.ravel())

        print("X.shape before: ", X.shape)
        X, y = filter_array(X, y)
        print("X.shape after: ", X.shape)

        # # normalize
        scaler, X_norm = normalize(X)

        n_particles = X_norm.shape[0]
        scale = 300

        mkl_methods = []  # "MLKR", "LFDA"
        dimreduc_methods = ["tsne", "mds", "isomap"]
        methods = mkl_methods + dimreduc_methods

        for method in methods:
            if method in mkl_methods:
                model = EmbeddingSpace(embedding_method=method)
                model.fit(X_train=X_norm, y_train=y)
                X_trans = model.transform(X_val=X_norm, get_min_dist=False)

            if method in dimreduc_methods:
                print("X shape", X.shape)
                print("y shape", y.shape)

                scaler, Xy_norm = normalize(np.c_[X, y])  #

                model = Preprocessing(similarity_matrix=Xy_norm)

                if method == "tsne":
                    X_trans, _ = model.tsne(n_components=2,
                                            perplexity=20,
                                            early_exaggeration=200,
                                            learning_rate=200.0,
                                            n_iter=1000,
                                            n_iter_without_progress=300,
                                            min_grad_norm=1e-07,
                                            metric='euclidean',
                                            init='random',
                                            verbose=0,
                                            random_state=None,
                                            method='barnes_hut',
                                            angle=0.5,
                                            n_jobs=None)
                if method == "isomap":
                    X_trans, _ = model.iso_map(n_neighbors=int(n_particles /
                                                               scale),
                                               n_components=2,
                                               eigen_solver='auto',
                                               tol=0,
                                               max_iter=None,
                                               path_method='auto',
                                               neighbors_algorithm='auto',
                                               n_jobs=None)
                if method == "mds":
                    X_trans, _ = model.mds(n_components=2,
                                           metric=True,
                                           n_init=4,
                                           max_iter=300,
                                           verbose=0,
                                           eps=0.001,
                                           n_jobs=None,
                                           random_state=None,
                                           dissimilarity='euclidean')

            save_at = summary_folder + "/{0}/{1}/comb_{2}/{3}/dens.pdf".format(
                y_col, method, len(X_cols), "|".join(X_cols))

            print("Save at:", save_at)
            title = save_at.replace(ResultDir, "").replace("/", "\n")

            # Use distinct names for the embedding coordinates so the label
            # vector y is not overwritten between methods.
            x_embed = X_trans[:, 0]
            y_embed = X_trans[:, 1]
            xlabel = "{0} dim 1".format(method)
            ylabel = "{0} dim 2".format(method)
            joint_plot_2(x=x_embed,
                         y=y_embed,
                         xlabel=xlabel,
                         ylabel=ylabel,
                         xlim=(min(x_embed) - 0.1, max(x_embed) + 0.1),
                         ylim=(min(y_embed) - 0.1, max(y_embed) + 0.1),
                         title=title,
                         save_at=save_at)

            scatter_plot_4(x=x_embed,
                           y=y_embed,
                           color_array=None,
                           xvlines=None,
                           yhlines=None,
                           sigma=None,
                           mode='scatter',
                           lbl=None,
                           name=None,
                           s=30,
                           alphas=0.6,
                           title=title,
                           x_label='x',
                           y_label='y',
                           save_file=save_at.replace("dens.pdf",
                                                     "scatter.pdf"),
                           interpolate=False,
                           color='blue',
                           preset_ax=None,
                           linestyle='-.',
                           marker='o')
Example #14
class Dictionary:
    def __init__(self):
        self.redis_handler = redis.Redis(db=1, decode_responses=True)
        self.preprocessing = Preprocessing()

        self.prepared_dic = dict()
        self.prepared_bigram = dict()
        self.prepared_lencat = dict()

        self.bigram_prefix = 'bigram_'
        self.lencat_prefix = 'lencat_'

        self.DB_DICTIONARY = 'dic_exists'
        self.DB_BIGRAM = 'bigram_exists'
        self.DB_LENCAT = 'lencat_exists'
        self.DB_COMMON = 'common_exists'

        self.CASE_UPPER = '1'
        self.CASE_LOWER = '2'
        self.CASE_BOTH = '0'

    def words_really_different(self, main_word, lemma_word):
        pattern = "^{}(es|s)?$".format(lemma_word.lower())
        try:
            if (not re.match(r"^[a-zA-Z]+$", main_word)):
                return False

            if (re.match(pattern, main_word.lower())):
                return False
        except re.error as e:
            print("{}: {}".format(pattern, main_word.lower()))
            raise Exception(str(e))

        return True

    def database_exists(self, keyword):
        return True if self.get_single_word_from_dic(keyword) == '1' else False

    def prepare_word2dic(self, main_word, root_word):
        word2store = root_word.lower()

        prev_case = self.prepared_dic[
            word2store] if word2store in self.prepared_dic else None
        current_case = self.get_word_case(main_word, prev_case=prev_case)
        self.prepared_dic[word2store] = current_case

        if (self.words_really_different(main_word, root_word)):
            self.prepared_dic[main_word.lower()] = current_case

        self.prepare_lencat2dic(main_word, root_word)

    def prepare_lencat2dic(self, main_word, root_word):
        word1 = root_word.lower()
        word2 = main_word.lower()

        lencat_index = "{}{}".format(self.lencat_prefix, len(word1))
        if (lencat_index in self.prepared_lencat):
            self.prepared_lencat[lencat_index].add(word1)
        else:
            self.prepared_lencat[lencat_index] = {word1}

        if (word1 != word2):
            lencat_index = "{}{}".format(self.lencat_prefix, len(word2))
            if (lencat_index in self.prepared_lencat):
                self.prepared_lencat[lencat_index].add(word2)
            else:
                self.prepared_lencat[lencat_index] = {word2}

    def prepare_bigram2dic(self, word, prev_word):
        word2look = "{}{}".format(self.bigram_prefix, word[0].lower())
        word_pos = word[0] if self.preprocessing.is_customized_word(
            word[0]) else word[1]
        prev_w = None if prev_word[0] == None else prev_word[0]

        if (prev_word[0] == None):
            prev_p = None
        elif (self.preprocessing.is_customized_word(prev_word[0])):
            prev_p = prev_word[0]
        else:
            prev_p = prev_word[1]

        if (word2look in self.prepared_bigram):
            self.prepared_bigram[word2look]['pos'].add(word_pos.lower())
            self.prepared_bigram[word2look]['frequency'] += 1

            if (prev_w != None):
                self.prepared_bigram[word2look]['prev_words'].add(
                    prev_w.lower())

            if (prev_p != None):
                self.prepared_bigram[word2look]['prev_pos'].add(prev_p.lower())
        else:
            self.prepared_bigram[word2look] = {
                'pos': {word_pos.lower()},
                'frequency': 1,
                'prev_words': set() if prev_w == None else {prev_w.lower()},
                'prev_pos': set() if prev_p == None else {prev_p.lower()}
            }

    def store_prepared_data(self):
        result = False
        set_dbs = set()

        if (len(self.prepared_dic) > 0):
            if (self.redis_handler.mset(self.prepared_dic)):
                result = True
                set_dbs.add(self.DB_DICTIONARY)
        if (len(self.prepared_bigram) > 0):
            with self.redis_handler.pipeline() as pipe:
                for word, data in self.prepared_bigram.items():
                    try:
                        pipe.set("{}_frequency".format(word),
                                 data['frequency'])
                        if (len(data['pos']) > 0):
                            pipe.sadd("{}_pos".format(word), *data['pos'])
                        if (len(data['prev_words']) > 0):
                            pipe.sadd("{}_prev_words".format(word),
                                      *data['prev_words'])
                        if (len(data['prev_pos']) > 0):
                            pipe.sadd("{}_prev_pos".format(word),
                                      *data['prev_pos'])
                    except TypeError as e:
                        print(str(e))
                        print("{}: {}".format(word, data))
                        return

                pipe_result = pipe.execute()

            if (not False in pipe_result):
                result = True
                set_dbs.add(self.DB_BIGRAM)
        if (len(self.prepared_lencat) > 0):

            with self.redis_handler.pipeline() as pipe:
                for index, words in self.prepared_lencat.items():
                    pipe.sadd(index, *words)

                pipe_result = pipe.execute()

            if (not False in pipe_result):
                result = True
                set_dbs.add(self.DB_LENCAT)
            else:
                print("{} => {}".format(self.DB_BIGRAM, pipe_result))

        if (result):
            with self.redis_handler.pipeline() as pipe:
                for db in set_dbs:
                    self.redis_handler.set(db, '1')

                pipe.execute()

        self.prepared_dic = dict()
        self.prepared_bigram = dict()
        self.prepared_lencat = dict()

        return result

    def add_single_word2dic(self, main_word, root_word):
        word2store = root_word.lower()
        value = self.get_single_word_from_dic(root_word)
        word_type = self.get_word_case(main_word, prev_case=value)

        with self.redis_handler.pipeline() as pipe:
            pipe.set(word2store, word_type)
            pipe.sadd("{}{}".format(self.lencat_prefix, len(word2store)),
                      word2store)
            if (self.words_really_different(main_word, root_word)):
                pipe.set(main_word.lower(), word_type)
                pipe.sadd("{}{}".format(self.lencat_prefix, len(main_word)),
                          main_word.lower())

            pipe.execute()

        return word_type

    def get_single_word_from_dic(self,
                                 word2look,
                                 bigram=False,
                                 postfix=None,
                                 type_set=False):
        word = word2look.lower() if not bigram else "{}{}_{}".format(
            self.bigram_prefix, word2look.lower(), postfix)

        word = self.redis_handler.get(
            word) if not type_set else self.redis_handler.smembers(word)

        if (word != None and type(word) is not set):
            return word
        elif (word != None and type(word) is set and len(word) > 0):
            return set([term for term in word if term != None])

        return None

    def add_single_word2bigram(self, word, prev_word):
        word2look = "{}{}".format(self.bigram_prefix, word[0].lower())
        word_pos = word[0] if self.preprocessing.is_customized_word(
            word[0]) else word[1]
        prev_w = None if prev_word[0] == None else prev_word[0]

        if (prev_w == None):
            prev_p = None
        elif (self.preprocessing.is_customized_word(prev_w)):
            prev_p = prev_w
        else:
            prev_p = prev_word[1]

        with self.redis_handler.pipeline() as pipe:
            if (self.get_single_word_from_dic("{}_frequency".format(word2look))
                    != None):
                pipe.incr("{}_frequency".format(word2look))
            else:
                pipe.set("{}_frequency".format(word2look), 1)

            pipe.sadd("{}_pos".format(word2look), *{word_pos.lower()})

            if (prev_w != None):
                pipe.sadd("{}_prev_words".format(word2look), *{prev_w.lower()})

            if (prev_p != None):
                pipe.sadd("{}_prev_pos".format(word2look), *{prev_p.lower()})

            print(word)

            pipe.execute()

    def get_single_word_from_bigram(self, word):
        frequency = self.get_single_word_from_dic(word,
                                                  bigram=True,
                                                  postfix='frequency')

        if (frequency != None):
            return {
                'pos':
                self.get_single_word_from_dic(word,
                                              bigram=True,
                                              postfix='pos',
                                              type_set=True),
                'frequency':
                frequency,
                'prev_words':
                self.get_single_word_from_dic(word,
                                              bigram=True,
                                              postfix='prev_words',
                                              type_set=True),
                'prev_pos':
                self.get_single_word_from_dic(word,
                                              bigram=True,
                                              postfix='prev_pos',
                                              type_set=True)
            }

        return None

    def get_word_case(self, word, prev_case=None):
        if (prev_case == None):
            return self.CASE_UPPER if word.isupper() else self.CASE_LOWER
        elif (prev_case == self.CASE_BOTH):
            return self.CASE_BOTH
        else:
            current_case = self.get_word_case(word)
            return self.CASE_BOTH if current_case != prev_case else current_case

    def get_words_by_length(self, length_list):
        pipe_result = []

        with self.redis_handler.pipeline() as pipe:
            for index in length_list:
                pipe.smembers("{}{}".format(self.lencat_prefix, index))

            pipe_result = pipe.execute()

        return pipe_result

    def store_common_words(self, words):
        if (not self.database_exists(self.DB_COMMON)):
            common_words = set()

            for word in words:
                for cw in word:
                    break
                common_words.add(cw)

            if (len(common_words) > 0 and self.redis_handler.sadd(
                    'common_words', *common_words)):
                self.redis_handler.set(self.DB_COMMON, '1')
            for i in range(9):
                x_array.append(" ")
                y_array.append(" ")
                l_array.append(" ")                

            accuracy_total_accumulation = 0
            precision_total_accumulation = 0
            recall_total_accumulation = 0
            fmeasure_total_accumulation = 0

            for i in range(len(data_train)):
                kfold_per_combination.append(i+1)
                y_test = []
                y_pred = []

                prepro = Preprocessing()
                cleaned_data, terms = prepro.preprocessing(data_train[i]["tweet"])
                
                tbrs = TermBasedRandomSampling(X=x, Y=y, L=l)
                stopwords = tbrs.create_stopwords(cleaned_data,terms)

                prepro2 = Preprocessing()
                new_cleaned_data, new_terms = prepro2.remove_stopword(cleaned_data, stopwords)

                weight = Weighting(new_cleaned_data, new_terms)
                tfidf = weight.get_tf_idf_weighting()
                idf = weight.get_idf()

                nb = NBMultinomial()
                nb.fit(new_cleaned_data,new_terms,data_train[i]["target"],stopwords,idf,tfidf)
                
Example #16
def lstm_model_headline_body_combin(body_length, numb_epoch):
    fexc = Preprocessing()
    data = load_data()

    # Loading train data from files
    data.set_path(path='fnc-1-master')
    train_stance_data = data.get_headline_body_stance()
    train_bodies_data = data.get_body_id_text()
    train_headlines, train_bodies, train_stances = data.get_mapped_id_body(
        train_stance_data, train_bodies_data)

    # Removing punctuation and stop words from the headline and body of train data
    train_headlines_cl = fexc.get_clean_data(train_headlines)
    train_bodies_cl = fexc.get_clean_data(train_bodies)
    train_stances_cl = fexc.get_clean_data(train_stances)

    # Convert labels to integer
    train_stances_in = fexc.convert_lable_int(train_stances_cl)

    # Load the test data
    data.set_name("test")
    test_stance_data = data.get_headline_body_stance()
    test_bodies_data = data.get_body_id_text()
    test_headlines, test_bodies = data.get_mapped_id_body(test_stance_data,
                                                          test_bodies_data,
                                                          data_type="test")

    # Removing punctuation and stop words from the headline and body of test data
    test_headlines_cl = fexc.get_clean_data(test_headlines)
    test_bodies_cl = fexc.get_clean_data(test_bodies)

    # Remove Stop words #
    test_headlines_cl = fexc.remove_stop_words_list(test_headlines_cl)
    test_bodies_cl = fexc.remove_stop_words_list(test_bodies_cl)

    # Set the tokenizer
    alltext = train_headlines_cl + train_bodies_cl + test_headlines_cl + test_bodies_cl
    token = Tokenizer(num_words=30000)
    token.fit_on_texts(alltext)
    print('Number of Unique words: ' + str(len(token.word_index.keys())))

    # Combine the headline and bodies of training data
    train_data = fexc.combine_heading_body(train_headlines_cl, train_bodies_cl)
    word_index = token.word_index

    # Converting train data to sequence
    train_data = token.texts_to_sequences(train_data)

    # Padding train data
    train_data = pad_sequences(train_data,
                               maxlen=(MAX_HEADLINE_LENGTH + int(body_length)))

    # Converting the labels to one hot encoder
    onehotencoder = OneHotEncoder()
    train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray()

    # Splitting the data in train and validation
    train_data, val_data, train_stances_final, stances_val = \
        train_test_split(train_data, train_stances_in, test_size=0.2, random_state=42)

    # Combining test data
    test_data = fexc.combine_heading_body(test_headlines_cl, test_bodies_cl)

    # Converting test data to sequence
    test_data = token.texts_to_sequences(test_data)

    # Padding test data
    test_data = pad_sequences(test_data,
                              maxlen=MAX_HEADLINE_LENGTH + int(body_length))

    # Getting embedding index
    embeddings_index = models.get_embeddings_index(GLOVE_DIR)

    print('Found %s word vectors.' % len(embeddings_index))

    # Getting embedding matrix
    embedding_matrix = models.get_embedding_matrix(
        embedding_dim=EMBEDDING_DIM,
        embeddings_index=embeddings_index,
        word_index=word_index)

    # Building the Model
    fake_nn = models.lstm_with_combine_headline_body(
        headline_length=MAX_HEADLINE_LENGTH,
        body_length=int(body_length),
        embedding_dim=EMBEDDING_DIM,
        word_index=word_index,
        embedding_matrix=embedding_matrix,
        activation='relu',
        drop_out=0.5,
        numb_layers=300,
        cells=200)

    # Early stopping and model checkpoint
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    bst_model_path = 'Fake_news_nlp.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       save_best_only=True,
                                       save_weights_only=True)

    # Fitting the model
    fake_hist = fake_nn.fit(train_data,
                            train_stances_final,
                            batch_size=128,
                            epochs=int(numb_epoch),
                            shuffle=True,
                            validation_data=(val_data, stances_val),
                            callbacks=[early_stopping, model_checkpoint])

    # Storing the training and validation accuracy and loss in file for plot
    lstm_data = []
    with open(
            os.path.join(
                OBJECT_DUMP,
                "lstm_headline_body_combine" + str(body_length) + ".txt"),
            'wb') as bow_hist:
        lstm_data.append(fake_hist.history['acc'])
        lstm_data.append(fake_hist.history['val_acc'])
        lstm_data.append(fake_hist.history['loss'])
        lstm_data.append(fake_hist.history['val_loss'])
        pickle.dump(lstm_data, bow_hist)

    # Predict the labels for test data
    result = fake_nn.predict([test_data], batch_size=128)

    # Store the results in the result file
    result_str = fexc.convert_lable_string(result)
    with io.open(TEST_FILE, mode='r', encoding='utf8') as read_file:
        test_stance = csv.DictReader(read_file)
        with io.open(RESULT_FILE + "_" + str(body_length) + ".csv",
                     mode='w',
                     encoding='utf8') as write_file:
            writer = csv.DictWriter(
                write_file, fieldnames=['Headline', 'Body ID', 'Stance'])
            writer.writeheader()
            for sample, prediction in zip(test_stance, result_str):
                writer.writerow({
                    'Body ID': sample['Body ID'],
                    'Headline': sample['Headline'],
                    'Stance': prediction
                })

            # Print the Accuracy, competition score and confusion matrix
            print_result("fnc-1-master/competition_test_stances.csv",
                         RESULT_FILE + "_" + str(body_length) + ".csv")
Example #17
    def preprocessing(self,
                      doPreprocessing,
                      doFeatureSelection,
                      take_feature,
                      threshold,
                      progress,
                      qc,
                      label=None):
        features = None
        if self.con != None:
            if self.training_table:
                if label != None:
                    label.setText("Getting data training ...")
                self.dataTraining = self.con.getDataAsDF(self.training_table)
                progress.setValue(10)
                if self.dataTraining is not None:
                    p = Preprocessing(con=self.con)
                    oritext = None
                    uniqFeature = []
                    features = {}
                    originalFeatureCount = 0
                    progressP = 10
                    progressS = (70 - progressP) / len(self.dataTraining.index)
                    if label != None:
                        label.setText("Preprocessing data training ...")
                    for index, row in self.dataTraining.iterrows():
                        text = row[self.text_col]

                        if doPreprocessing:
                            pretext = p.process(text)
                            oritext = pretext['oritext']
                            pretext = pretext['stemmed_text']
                        else:
                            pretext = p.processNoPre(text)

                        t = p.processNoPre(pretext).split(
                            " ")  # bad performance
                        uniqFeature.extend(t)  # bad performance

                        # print("Ori : ",text)
                        # print("Preprocessed : ",pretext," -> ",row[self.class_col])
                        self.dataTraining.at[index, self.text_col] = pretext
                        progressP += progressS
                        progress.setValue(progressP)
                        # time.sleep(0.5)
                        qc.processEvents()
                    progress.setValue(70)
                    qc.processEvents()
                    uniqFeature = set(uniqFeature)  # bad performance
                    qc.processEvents()
                    features['featurebefore'] = len(
                        uniqFeature)  # bad performance
                    qc.processEvents()
                    progress.setValue(80)

                    features['vsm'] = self.builtVSM(doFeatureSelection,
                                                    take_feature,
                                                    threshold,
                                                    qc=qc,
                                                    label=label)
                    features['oritext'] = oritext
                    progress.setValue(90)
            else:
                print("No training table!")
        progress.setValue(100)

        return features
Example #18
        proba(term | topic) = beta[topic][term]

        We shall for each topic find the top 20 words that contribute 
        to a document being classified as said topic
        """
        top_20_per_topic = np.argsort(self.beta * (-1), axis=1)
        for i in range(self.nb_topics):
            for j in range(self.nb_terms):
                if top_20_per_topic[i][j] < 20:
                    print(self.index[j], end=" ")
            print()


if __name__ == "__main__":
    """
    Example of application using newsgroups
    """
    from sklearn.datasets import fetch_20newsgroups

    train = fetch_20newsgroups(subset='train',
                               remove=('headers', 'footers', 'quotes'))

    pp = Preprocessing()

    index, bow = pp.build_bow(pp.corpus_preproc(train["data"]))

    lda = LDA(5, bow, index, alpha=0.1, set_alpha=True)

    lda.estimation(max_iter_em=100, max_iter_var=10)

    lda.display_word_topic_association()
Example #19
	def load_location_json_test(self):
		instance=Preprocessing(self.config)
		#instance.load_location_json('UserInfo_2')
		instance.output_coor_scale()
Example #20
    #If the '-specificity' flag is not set, the default value is a str; if the flag is set, the value comes as a list
    specificity = None
    if type(args.specificity) == str:
        specificity = args.specificity
    else:
        specificity = args.specificity[0]

    #'-result' flag cannot be more specific than '-specificity' flag 
    #ie: if you are searching by paragraph, how can you print sentences?
    if resultDir[args.results[0]] < specificityDir[specificity]:
        sys.exit('Search results specificity cannot be broader than search algorithm specificity, program terminating')

    #For each file in folder, reading and tokenizing text according to '-result' flag
    fileTokenTouplesResults = []
    for fileName in os.listdir(searchDocPath):
        fileObj = Preprocessing(resultDir[args.results[0]], fileName)
        fileObj.readFile()
        fileObj.tokenizeText()
        #Appending text and file info as tuple into list
        fileTokenTouplesResults.append((fileObj.tokenizedList, fileObj.fileName))

    equalBool = 0
    fileTokenTouplesSpecificity = []
    #If '-specificity' flag is the same as '-result' flag, reuse computed tuple list from '-result'
    if resultDir[args.results[0]] == specificityDir[specificity]:
        fileTokenTouplesSpecificity = fileTokenTouplesResults
        equalBool = 1
    #If they are different, read and tokenize text according to '-specificity' flag
    else: 
        for fileName in os.listdir(searchDocPath):
            fileObj = Preprocessing(specificityDir[specificity], fileName)
Example #21
import networkx as nx
import matplotlib.pyplot as plt
from community import community_louvain

from preprocessing import Preprocessing

preprocessingClass = Preprocessing("lastfm_similars.db")
nodes = preprocessingClass.get_all_nodes()
indexes = preprocessingClass.giving_indexes_to_tIds(nodes)
allData = preprocessingClass.getting_all_data(indexes)

trackIds = []

inFile = open('track_list.txt')
for line in inFile:
    fields = line.strip().split('<S>')
    trackIds.append(fields[0])
inFile.close()

G = nx.Graph()

print("getting the graph start")
for node in allData:
    if (allData[node]["tid"] in trackIds):
        for i in range(len(allData[node]["similars"])):
            G.add_edge(node,
                       allData[node]["similars"][i],
                       weight=allData[node]["weights"][i])

# nx.draw(G, pos=nx.circular_layout(G), node_color='r', edge_color='b')
# plt.show()
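community_louvain is imported above but not used in this excerpt; a hedged sketch of the usual partition step on the weighted similarity graph G:

# Sketch: Louvain community detection on the graph built above.
partition = community_louvain.best_partition(G, weight='weight')
print("communities found:", len(set(partition.values())))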
from preprocessing import Preprocessing
from segmentaion import Segmentation
from matplotlib import pyplot as plt
from skimage import io

import numpy as np
from skimage.feature import greycomatrix, greycoprops
from skimage import data
from skimage.color import rgb2gray

# you can add a loop to handle all images at once
preprocessing = Preprocessing()
preprocessing.preproces('ImageFile')
#preprocessing.preproces('C:/Users/Teja/Desktop/internship/project/braintumor/Cl/defect.jpg')
preprocessing.binarization()
preprocessing.removingSkul()
preprocessing.enhanceImage()
preprocessing.segmentation()
image = preprocessing.getInfectedRegion()

# ### Extract GLCM Texture  Features

im = io.imread(
    'C:/Users/Teja/Desktop/internship/project/braintumor/tmp/tumourImage.jpg')

# GLCM Texture Features
ds = []
cr = []
cn = []
am = []
Example #23
    def main():

        # create a logger
        logging = log.Logging(user_messages=False, timer_messages=True)

        # filter configurations
        filter_conf = fc.FilterConfiguration(logging)
        filter_conf.import_filters()

        # preprocessing configurations
        pp_config = pc.PreprocessingConfiguration(logging)
        pp_config.config()

        # save configurations
        save_config = sc.SaveConfiguration(logging)
        save_config.config()

        # tweepy
        if (filter_conf.tweepy):
            # Twitter connection
            twitter_conn = tt.TwitterAuthenticator(logging)
            twitter_conn.connect()

            # Twitter query
            tt_query_tweepy = qt.QueryTweets(twitter_conn,
                                             filter_conf.list_filters, True,
                                             logging)
            tt_query_tweepy.query_manager()

            # Twitter preprocessing
            preprocessing = pp.Preprocessing(pp_config,
                                             tt_query_tweepy.dict_df_posts,
                                             logging)
            preprocessing.preprocessing()

            # Twitter save
            save = sv.Save(save_config, tt_query_tweepy.dict_df_posts, logging)
            save.save()

        # twint
        if (filter_conf.twint):
            # Twitter query
            tt_query_twint = qt2.QueryTweetsV2(filter_conf.list_filters, True,
                                               logging)
            tt_query_twint.query_manager()

            # Twitter preprocessing
            preprocessing = pp.Preprocessing(pp_config,
                                             tt_query_twint.dict_df_posts,
                                             logging)
            preprocessing.preprocessing()

            # Twitter save
            save = sv.Save(save_config, tt_query_twint.dict_df_posts, logging)
            save.save()

        # praw
        if (filter_conf.praw):
            # Reddit connection
            reddit_conn = rd.RedditAuthenticator(logging)
            reddit_conn.connect()

            # Reddit query
            rt_query_praw = qr.QueryRedditPosts(reddit_conn,
                                                filter_conf.list_filters, True,
                                                logging)
            rt_query_praw.query_manager()

            # Reddit preprocessing
            preprocessing = pp.Preprocessing(pp_config,
                                             rt_query_praw.dict_df_posts,
                                             logging)
            preprocessing.preprocessing()

            # Reddit save
            save = sv.Save(save_config, rt_query_praw.dict_df_posts, logging)
            save.save()

        # pmaw
        if (filter_conf.pmaw):
            # Reddit query
            rt_query_pmaw = qr2.QueryRedditPostsV2(filter_conf.list_filters,
                                                   True, logging)
            rt_query_pmaw.query_manager()

            # Reddit preprocessing
            preprocessing = pp.Preprocessing(pp_config,
                                             rt_query_pmaw.dict_df_posts,
                                             logging)
            preprocessing.preprocessing()

            # Reddit save
            save = sv.Save(save_config, rt_query_pmaw.dict_df_posts, logging)
            save.save()
Example #24
import pandas as pd
import numpy as np
from knn import KNN
from classifier import Classifier
import matplotlib.pyplot as plt
import visualization
from sklearn.model_selection import train_test_split
from preprocessing import Dataset, Preprocessing
from decisiontree import DecisionTree

# -----------------------------------------------------------------------------
# 0. Preprocessing

# get peprocessed and cleaned dataframes
cancer_df, hepatitis_df = Preprocessing.get_preprocessed_datasets()
cancer_features, cancer_labels = Preprocessing.get_labels_features(cancer_df)
hepatitis_features, hepatitis_labels = Preprocessing.get_labels_features(
    hepatitis_df)

# Dataset 1 (Breast Cancer)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(cancer_features,
                                                            cancer_labels,
                                                            test_size=0.33)
# Dataset 2 (Hepatitis)
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(hepatitis_features,
                                                            hepatitis_labels,
                                                            test_size=0.33)

# -----------------------------------------------------------------------------
# 1. Compare the accuracy of KNN and Decision Tree algorithm on the two datasets.
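A hedged sketch of the comparison announced in the comment above; it assumes the project's KNN and DecisionTree classes follow a scikit-learn-like fit/predict interface, and their constructor arguments are omitted because they are not shown in this excerpt:

from sklearn.metrics import accuracy_score

# Sketch only: fit/predict and the bare constructors are assumptions.
for name, model in [("KNN", KNN()), ("Decision Tree", DecisionTree())]:
    model.fit(X_train_c, y_train_c)
    acc = accuracy_score(y_test_c, model.predict(X_test_c))
    print("{} accuracy on breast cancer: {:.3f}".format(name, acc))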
class Geo():
    """
    Loads data, loads preprocessing and does country mapping.
    """
    def __init__(self):
        self.pp = Preprocessing()
    
    def getGeoTweets(self, path, limit):

        data = self.loadGeoData(path, limit)
        
        country_map = self.country_mapping(data)
    
        return data, country_map
    
           
    def loadGeoData(self, path, limit=None):
        
        counter = 0
        
        print("Loading data from " + path, end=" - ")
        t0 = time()
        
        data = numpy.loadtxt(path, dtype='str', delimiter="\t", usecols = [2,5,3,3,0], comments=None)

        print("done in %0.3fs" % (time() - t0))

        print("Preprocessing tweet texts", end=" - ")
        t0 = time()
        empty_row = []      
        for i in range(len(data)):
            
            counter+=1
            
            tweet = self.pp.preprocess_tweet(data[i][2])
            data[i][2] = tweet
            if tweet == "":
                empty_row.append(i)
            if limit and counter >= limit:
                if len(empty_row) > 1:
                    empty_row.sort(reverse=True)
                for id in empty_row:
                    data = numpy.delete(data, (id), axis=0)
                print("done in %0.3fs" % (time() - t0))
                print()

                return data[:limit]
            
        print("done in %0.3fs" % (time() - t0))
        print()
        if len(empty_row) > 1:
            empty_row.sort(reverse=True)

        for id in empty_row:
            data = numpy.delete(data, (id), axis=0)
        return data
    
    def country_mapping(self, data):
        """
        Mapping: country (key) - corresponding tweet ids (value list).
        """
        country_map = {}
        for i in range(len(data)):
            country = data[i][1]
            if country in country_map.keys():
                tmp = country_map[country]
                tmp.append(i)
                country_map[country] = tmp
            else:
                country_map[country] = [i]
        
        return country_map
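The same mapping can be built with collections.defaultdict; a small alternative sketch, assuming (as above) that the country code sits in column 1 of each row:

from collections import defaultdict

def country_mapping(data):
    # country code -> list of tweet row indices, without the explicit key check.
    country_map = defaultdict(list)
    for i, row in enumerate(data):
        country_map[row[1]].append(i)
    return dict(country_map)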
 def __init__(self):
     self.pre_processing = Preprocessing()
     self.LOWER_GRAY_2 = np.array([0, 0, 100])
     self.UPPER_GRAY_2 = np.array([255, 80, 175])
Example #27
def preprocessing(file,originalSamplingRate=48000,originalChannels=2, sampleRate=48000, channels=2, alpha = 0.97,frameLength=20,windowType ="hanning"):
    prep = Preprocessing()
    prep.readSoundFile(file,sr=originalSamplingRate,n_channels=originalChannels)
    prep.channelConversion(channels)
    prep.resampling(sampleRate)
    prep.pre_emphasis(alpha)
    prep.framing(frameLength)
    prep.addWindow(windowType)
    prep.vad()
    return prep.frames
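A hypothetical call site for the pipeline above; the file name and parameter values are placeholders, not taken from the original code:

# Hypothetical usage; 'speech.wav' is a placeholder path.
frames = preprocessing('speech.wav', originalSamplingRate=44100, originalChannels=1,
                       sampleRate=16000, channels=1, frameLength=25, windowType='hamming')
print('number of frames:', len(frames))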
Example #28
	def get_location_test(self):
		instance=Preprocessing(self.config)
		instance.get_location()
Example #29
dataset = VideoLoader(
    args.csv,
    framerate=1 if args.type == '2d' else 24,
    size=224 if args.type == '2d' else 112,
    centercrop=(args.type == '3d'),
)
n_dataset = len(dataset)
sampler = RandomSequenceSampler(n_dataset, 10)
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    num_workers=args.num_decoding_thread,
    sampler=sampler if n_dataset > 10 else None,
)
preprocess = Preprocessing(args.type)
model = get_model(args)
feat_root = args.feat_root
with th.no_grad():
    for k, data in enumerate(loader):
        input_file = data['input'][0]
        output_file = data['output'][0]
        output_path = os.path.join(feat_root, output_file)
        if len(data['video'].shape) > 3:
            print('Computing features of video {}/{}: {}'.format(
                k + 1, n_dataset, input_file))
            video = data['video'].squeeze()
            if len(video.shape) == 4:
                video = preprocess(video)
                n_chunk = len(video)
                features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0)
Example #30
from preprocessing import Preprocessing
from language import Language
from image import Image_modality
import util
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from keras.models import Model, Input
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
	pre = Preprocessing()
	df_input = pre.data
	train_data = pre.train_data
	test_data = pre.test_data
	Y_train = pre.Y_train
	Y_test = pre.Y_test


	lng = Language(df_input)

	print("Preparing the language data")
	train_tokens = train_data['title'].apply(util.get_tokens)
	lng_data_train = lng.get_encoded_data(train_tokens)

	test_tokens = test_data['title'].apply(util.get_tokens)
	lng_data_test = lng.get_encoded_data(test_tokens)
	language_model = lng.lng_model
	print("training the language model (bi-lstm), this might take some time")
	language_model.fit(lng_data_train, Y_train, verbose=1, validation_split=0.2, nb_epoch=5)

	## printing precision_recall- language modality
Example #31
	def output_missing_scale_test(self):
		instance=Preprocessing(self.config)
		instance.output_missing_scale()
Example #32
    def processFrameOf(self, camera):
        if not camera.isUp():
            #self.log("WARN", "Video stream for Camera: " + camera._id + " not available")
            return False
        maxt = 10
        frame = None
        for i in range(1, maxt + 1):
            #self.log("INFO", "Trying to accesss frame {}/{}".format(i, maxt))
            try:
                ret, f = camera.read()
                if ret:
                    frame = f
            except Exception:
                # Ignore failed reads and retry until a valid frame is obtained.
                pass

        if frame is None:
            #self.log("WARN", "Couldn't access a valid frame")
            return False

        if camera._id in self.preprocessings:
            self.log("INFO", "Pre-Processing frame of camera: " + camera._id)
            st = time.time()
            lineCoords = [(5, frame.shape[0] - 30 * (i + 1)) for i in range(3)]
            pp = self.preprocessings[camera._id]
            if 'brightness' in pp:
                bv = pp['brightness']
                frame = Preprocessing.adjustBrightness(frame, bv)
                Preprocessing.putText(frame, "Brightness: " + str(bv),
                                      lineCoords[0])

            if 'sharpness' in pp:
                sv = pp['sharpness']
                frame = Preprocessing.sharpenImage(frame, k=sv)
                Preprocessing.putText(frame, "Sharpness: " + str(sv),
                                      lineCoords[1])

            if 'denoise' in pp:
                dv = pp['denoise']
                if dv > 0:
                    frame = Preprocessing.denoiseImage(frame, strength=dv)
                    Preprocessing.putText(frame, "denoise: " + str(dv),
                                          lineCoords[2])

            et = time.time()
            self.log("TIME", "Action took {:2.6f}s".format((et - st)))

        #person detection
        #plt.imshow(frame)
        self.log("INFO", "Detecting People in the frame")
        bboxes, conf = self.pd.detect(frame, drawOnFrame=False)
        #overlapping bounding boxes
        self.log("INFO", "Applying nms")
        bboxes = non_max_suppression(np.array(bboxes),
                                     probs=None,
                                     overlapThresh=0.65)
        #tracking
        if len(bboxes) > 0:

            tbboxes, tids = camera.tk.track(frame,
                                            bboxes,
                                            conf,
                                            drawOnFrame=False)
            if len(tbboxes) > 0:

                self.log("INFO", "Tracking people {}".format(len(tids)))
                for i in range(len(tbboxes)):
                    tbbox = np.array(tbboxes[i], np.int32)
                    tid = tids[i]
                    #increasing fps by selective recognition
                    if camera.track.hasPerson(tid):
                        if camera.track.people[tid].isSuspect():
                            if time.time() - camera.track.people[
                                    tid].whenRecognized < self.recognizeThresh:
                                continue

                    person = frame[tbbox[1]:tbbox[3], tbbox[0]:tbbox[2]]
                    #cv2.imshow("person: ", person)
                    faces = fdr.extractFaces(person, drawOnFrame=False)
                    if len(faces) <= 0:
                        continue

                    face = faces[0]
                    fe = fdr.getEmbedding(face[0])

                    #check if he/she is a suspect
                    suspectDetected = False
                    for k, suspect in self.suspects.items():
                        #{"face":face, "em":em, "path":path}
                        for pic in suspect.pictures:
                            em = pic['em']
                            if fdr.is_match(em, fe):
                                camera.track.suspectDetected(
                                    tid, suspect, time.time(), frame,
                                    self.SERVER_ID, camera._id)
                                suspectDetected = True
                                break
                        if suspectDetected:
                            break

                #update track
                camera.track.updatePositions(tbboxes, tids)

        camera.track.clearForgotten()
        #display bboxes and everything
        camera.track.draw(frame)
        #update the processedFrame
        #cv2.imshow("Frame", frame)

        t = time.localtime()
        text = "Server: " + time.strftime("%H:%M:%S", t)
        cv2.putText(frame, text, (10, 60), cv2.FONT_HERSHEY_COMPLEX, 0.5,
                    (0, 255, 255), 1)

        with self.lock:
            camera.processedFrame = frame
            camera.processedFrameTime = time.time()
            self.xo = 1
        return True
Example #33
class Summarization(object):
    def __init__(self,
                 lang_code,
                 method="LSA",
                 n_words=200,
                 k=1,
                 sv_threshold=0.5,
                 min_df=0,
                 max_df=.1,
                 use_idf=True):

        self.lang_code = lang_code
        self.method = method
        self.n_words = n_words
        self.k = k  # num topics
        self.sv_threshold = sv_threshold
        self.min_df = min_df
        self.max_df = max_df
        self.use_idf = use_idf
        self.valid_langs = ["en"]

        if self.lang_code in self.valid_langs:
            self.p = Preprocessing(lang_code=lang_code)
            self.tfidf = TfidfVectorizer(min_df=min_df,
                                         max_df=max_df,
                                         use_idf=use_idf)

    def generate_doc_term_matrix(self, norm_sents):
        """ Generate document term matrix from normalized sentences """
        dt_matrix = self.tfidf.fit_transform(norm_sents)
        dt_matrix = dt_matrix.toarray()
        return dt_matrix

    def generate_term_doc_matrix(self, dt_matrix):
        """ Generate term document matrix from document term matrix """
        td_matrix = dt_matrix.T
        return td_matrix

    def generate_summary(self, sents, top_sentence_indices):
        """ Generate summary from original sentences using top sentence indices """
        sents = np.array(sents)
        summary = "\n".join(sents[top_sentence_indices])
        return summary

    def summarize(self, text, n_sents=3):
        """ Summarize a given text and get top sentences """
        try:
            prediction = dict()

            if text:
                if self.lang_code in self.valid_langs:
                    if Utility.get_doc_length(text) > self.n_words:
                        # generate sentences, normalized sentences from text
                        sents, norm_sents = self.p.text_preprocessing(text)
                        # generate doc-term-matrix, term-doc-matrix
                        dt_matrix = self.generate_doc_term_matrix(norm_sents)
                        td_matrix = self.generate_term_doc_matrix(dt_matrix)

                        if self.method == "LSA":
                            lsa = LSA(self.k, td_matrix)
                            term_topic_matrix, singular_values, topic_doc_matrix = lsa.u, lsa.s, lsa.vt
                            # remove singular values below the given threshold
                            singular_values = lsa.filter_singular_values(
                                singular_values, self.sv_threshold)
                            # get salience scores from top singular values & topic document matrix
                            salience_scores = lsa.get_salience_scores(
                                singular_values, topic_doc_matrix)
                            # get the top sentence indices for summarization
                            top_sentence_indices = lsa.get_top_sent_indices(
                                salience_scores, n_sents)
                            summary = self.generate_summary(
                                sents, top_sentence_indices)
                        elif self.method == "TEXT_RANK":
                            tr = TextRank(dt_matrix, td_matrix)
                            # build similarity graph
                            similarity_matrix = tr.similiarity_matrix
                            similarity_graph = tr.get_similarity_graph(
                                similarity_matrix)
                            # compute pagerank scores for all sentences
                            ranked_sents = tr.rank_sentences(similarity_graph)
                            # get the top sentence indices for summarization
                            top_sentence_indices = tr.get_top_sentence_indices(
                                ranked_sents, n_sents)
                            summary = self.generate_summary(
                                sents, top_sentence_indices)
                        else:
                            return "no method found"

                        # apply cleaning for readability
                        summary = Utility.remove_multiple_whitespaces(summary)
                        summary = Utility.remove_trailing_whitespaces(summary)
                        prediction["summary"] = summary
                        prediction["message"] = "successful"
                    else:
                        return "required at least {} words".format(
                            self.n_words)
                else:
                    return "language not supported".format()
            else:
                return "required textual content"
            return prediction
        except Exception:
            logging.error("exception occurred", exc_info=True)
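For context, a hypothetical usage sketch of the class above, assuming the Preprocessing, LSA, TextRank, and Utility helpers it references are importable and that article.txt holds more than 200 words of English text:

summarizer = Summarization(lang_code="en", method="LSA", n_words=200, k=1)
with open("article.txt") as f:
    result = summarizer.summarize(f.read(), n_sents=3)
if isinstance(result, dict):
    print(result["summary"])
else:
    print(result)  # an error string, e.g. "requires at least 200 words"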
Exemple #34
0
class Model():
    """
    Model Class
    """
    def __init__(self):

        self.__utils = Utils()
        self.__preprocessing = Preprocessing()
        self.model_fit = self.fit()
        self.__preprocessing.execute()

    @property
    def dataset_preprocessed(self):
        """
        Returns
        -------
        DataFrame Houses Preprocessed

        """

        df = pd.read_csv("data/houses_clean.csv")

        df.drop('Unnamed: 0', axis=1, inplace=True)

        return df

    def get_prepared_df(self):
        """
       Prepare dataframe for modelling

       Parameters
       ----------
       df : Dataframe Data

       Returns
       -------
       Array to Model 

       """

        df = self.dataset_preprocessed

        df['size'] = df['size'].apply(lambda x: math.ceil(x / 5.0) * 5.0)

        #Property Type Union
        df['propertyType'] = np.where((df['propertyType'] == 'studio') |
                                      (df['propertyType'] == 'duplex'), 'flat',
                                      df['propertyType'])

        #Select Features
        df = df[[
            'price', 'size', 'propertyType', 'district', 'status', 'roomsCat',
            'bathroomsCat'
        ]]  #,'box_posto_auto','hasTerrace',
        #'hasGarden','hasSwimmingPool']]

        return df

    @property
    def labels_dataset(self):
        """
        Returns
        -------
        Labels Dataset Numpy Array

        """
        df = self.get_prepared_df()
        labels = np.array(df['price'])

        return labels

    @property
    def features_dataset(self):
        """
        Returns
        -------
        Features Dataset Numpy Array

        """
        df = self.get_prepared_df()
        features = df.drop('price', axis=1)
        features = np.array(features)

        return features

    @property
    def feat_tsf_dataset(self):
        """
        Returns
        -------
        Features Dataset Numpy Array with Category Encoders

        """
        features = self.features_dataset
        labels = self.labels_dataset

        #Encoder
        encoder = ce.GLMMEncoder(cols=self.cat_index)

        #Encoder Cv
        cv_encoder = NestedCVWrapper(feature_encoder=encoder,
                                     cv=5,
                                     shuffle=True,
                                     random_state=7)

        #Apply Transform to all datasets
        feat_tsf = cv_encoder.fit_transform(features, labels)

        return feat_tsf

    @property
    def features_list(self):
        """
        Returns
        -------
        Features List

        """
        df = self.get_prepared_df()
        features = df.drop('price', axis=1)
        # Saving feature names for later use
        feature_list = list(features.columns)

        return feature_list

    @property
    def n_features(self):
        """
        Returns
        -------
        Number of features

        """
        return len(self.features_list)

    @property
    def cat_index(self):
        """
        Returns
        -------
        Index position categorical columns

        """
        df = self.get_prepared_df()
        df.drop('price', axis=1, inplace=True)
        categorical_features_indices = np.where((df.dtypes != np.int)
                                                & (df.dtypes != np.float))[0]

        index = categorical_features_indices.reshape(1, -1).tolist()[0]

        return index

    def search_best_rf(self, n_trees=2500, saveStats=True):
        """
        Seach Best Random Forest Model
  
        Parameters
         ----------
        df : DataFrame prepared (method prepared_data)
  
        Returns
        -------
        JSON File (model_params_rf.json).
  
        """
        #Process Time
        start = time.time()

        #Datasets
        feat_tsf = self.feat_tsf_dataset
        labels = self.labels_dataset

        #max_features values to test
        max_features_list = np.arange(0.20, 0.66, 0.01).tolist()
        max_features_list = [round(elem, 2) for elem in max_features_list]

        max_features_list.append('sqrt')
        max_features_list.append('auto')

        #Get the maximum tree depth observed in a reference forest
        max_n_trees = self.depth_of_trees.max()[0]
        max_depth_list = np.arange(int(max_n_trees / 4), max_n_trees,
                                   1).tolist()
        max_depth_list.append(None)

        #min_impurity_decrease
        min_impurity_decrease_list = np.arange(0.01, 0.26, 0.01).tolist()
        min_impurity_decrease_list = [
            round(elem, 2) for elem in min_impurity_decrease_list
        ]

        #min_samples_leaf_list.append(None)

        param_grid = {
            "max_features": max_features_list,
            "max_depth": max_depth_list,
            "min_impurity_decrease": min_impurity_decrease_list
        }

        #RF Model to test
        rf = RandomForestRegressor(bootstrap=True,
                                   oob_score=True,
                                   n_estimators=n_trees,
                                   random_state=7)

        #Define and execute pipe
        grid_cv = HalvingRandomSearchCV(estimator=rf,
                                        param_distributions=param_grid,
                                        random_state=7,
                                        max_resources='auto',
                                        verbose=3).fit(feat_tsf, labels)

        df_results = pd.DataFrame(grid_cv.cv_results_)

        #Save CV Results
        if saveStats:

            df_results.to_csv('data/cv_hyperparams_model.csv')

        print("Best Params:")
        print(grid_cv.best_params_)

        print("Saving model in 'model_params.joblib'")
        # Writing joblibfile with best model
        dump(grid_cv.best_estimator_, 'model_params.joblib')

        #Save json file with params best model
        json_txt = json.dumps(grid_cv.best_params_, indent=4)
        with open('model_params', 'w') as file:
            file.write(json_txt)

        #End Time
        end = time.time()
        time_elapsed = round((end - start) / 60, 1)

        return ('Time elapsed minutes: %.1f' % time_elapsed)

    def fit(self):
        """
        Returns
        -------
        Fit Best Params Model

        """

        #Datasets
        feat_tsf = self.feat_tsf_dataset
        labels = self.labels_dataset

        #Open params
        with open('model_params', 'r') as file:
            params_model = json.load(file)

        #Model
        rf = RandomForestRegressor(**params_model)

        #Fit & Metrics
        rf.fit(feat_tsf, labels)

        oob_score = (rf.oob_score_) * 100

        print("OOB Score: %.2f" % oob_score)

        return rf

    @property
    def oob_score(self):
        """
        Returns
        -------
        Best Model OOB Score

        """

        return self.model_fit.oob_score_

    @property
    def params(self):
        """
        Returns
        -------
        Best Model Params

        """

        return self.model_fit.get_params()

    def predict(
        self,
        size,
        propertyType,
        district,
        status,
        rooms,
        bathrooms,
        #box_posto_auto,
        #hasGarden,
        #hasTerrace,
        #hasSwimmingPool
    ):
        """
        
        Parameters
        ----------
        district : str (category)
        status : str (category)
        rooms : int
        bathrooms : int
        box_posto_auto : Bool(1,0)
        garden : Bool(1,0)
        terrace : Bool(1,0)
        hasSwimmingPool : Bool(1,0)

        Returns
        -------
        Prediction : Best Model Prediction

        """
        """
        #Avg Price Zone
        avg_price_zone_df = self.dataset_preprocessed[['district','avgPriceZone']]

        avg_price_zone_df = avg_price_zone_df.drop_duplicates()       
        
        avgPriceZone = avg_price_zone_df.loc[
            avg_price_zone_df['district']==district]['avgPriceZone'].values[0]
        """

        #Rooms Category
        roomsCat = self.roomsCategory(rooms)

        #Bathrooms Logic
        bathroomsCat = self.bathroomsCategory(bathrooms)

        #Array for prediction
        array = np.array([
            size,
            propertyType,
            district,
            status,
            roomsCat,
            bathroomsCat,
            #box_posto_auto,
            #hasGarden,
            #hasTerrace,
            #hasSwimmingPool
        ]).reshape(1, -1)

        #Encoder
        encoder = ce.GLMMEncoder(cols=self.cat_index)

        #Encoder CV KFold
        cv_encoder = NestedCVWrapper(encoder,
                                     cv=5,
                                     shuffle=True,
                                     random_state=7)

        #Datasets
        features = self.features_dataset
        labels = self.labels_dataset

        #Apply Transform to all datasets
        feat_tsf = cv_encoder.fit_transform(features, labels, array)

        #Prediction
        prediction = self.model_fit.predict(feat_tsf[1])[0]

        return prediction

    @property
    def permutation_importance(self):
        """
        Permutation Features Importance 
        
        Returns
        -------
        
        Graph Permutation Importance
        """

        #Datasets
        feat_tsf = self.feat_tsf_dataset
        labels = self.labels_dataset

        rf = load('model_params.joblib')

        #Fit
        rf.fit(feat_tsf, labels)

        #Permutation importance
        result = permutation_importance(rf,
                                        feat_tsf,
                                        labels,
                                        n_repeats=10,
                                        random_state=7,
                                        n_jobs=2)

        df = (pd.DataFrame({
            "ft": self.features_list,
            'imp_mean': result.importances_mean,
            'imp_dsvt': result.importances_std
        }))

        df.sort_values(by='imp_mean', ascending=False, inplace=True)

        sorted_idx = result.importances_mean.argsort()
        fig, ax = plt.subplots()
        ax.boxplot(result.importances[sorted_idx].T,
                   vert=False,
                   labels=self.get_prepared_df().iloc[:,
                                                      1:].columns[sorted_idx])
        ax.set_title("Permutation Importances")
        fig.tight_layout()

        return plt.show()

    def plot_tree(self, tree_number=0):
        """
        Parameters
        ----------
        tree_number : Int. Tree to plot. The default is 0.

        Returns
        -------
        Tree Image

        """
        model_rf = self.model_fit

        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 30), dpi=800)
        tree.plot_tree(model_rf.estimators_[tree_number],
                       feature_names=self.features_list,
                       class_names='price',
                       filled=True)
        fig.savefig('data/rf_individualtree.png')

        return fig

    def feature_imp(self):
        """
        Feature Importance Model Method

        Returns
        -------
        Dataframe with features Importance

        """
        df = (pd.DataFrame({
            "ft": self.features_list,
            'imp': self.model_fit.feature_importances_
        }))

        df.sort_values(by='imp', ascending=False, inplace=True)

        return df

    @property
    def depth_of_trees(self):
        """
        
        Returns
        -------
        Dataframe with Trees depth

        """

        #Get depth of trees
        max_depth_list = []

        rf = RandomForestRegressor(n_estimators=2500, max_features=0.35)

        feat_tsf = self.feat_tsf_dataset
        labels = self.labels_dataset

        rf.fit(feat_tsf, labels)

        for i in rf.estimators_:

            max_depth_list.append(i.get_depth())

        print("Max depht: %i trees" % max(max_depth_list))

        return pd.DataFrame(max_depth_list, columns=['trees'])

    def train_test_samples(self,
                           features,
                           labels,
                           test_size=0.20,
                           random_state=None):

        feat_tsf = self.feat_tsf_dataset
        labels = self.labels_dataset

        X_train, X_test, y_train, y_test = train_test_split(
            feat_tsf, labels, test_size=test_size, random_state=random_state)

        return X_train, X_test, y_train, y_test

    def avg_price_district(self, district):

        df = self.dataset_preprocessed

        df = df.groupby('district').mean()['priceByArea']

        return int(df.loc[df.index == district].values[0])

    @property
    def propertyTypeList(self):

        propertyTypelist = ['Flat', 'Attic', 'Villa', 'Country House']

        return propertyTypelist

    def propertyTypeConverter(self, propertyType):
        """
        Parameters
        ----------
        propertyType : Str  Selected Property Type.
    
        Returns
        -------
        Property Type str
    
        """
        #Options list
        propertyTypelist = self.propertyTypeList

        #Lower elements list
        propertyTypelist = [i.lower() for i in propertyTypelist]

        #Assertion
        assert propertyType.lower() in propertyTypelist

        #Default Value
        propertyTypeOutput = 'flat'

        if propertyType.lower() == 'flat':

            propertyTypeOutput = 'flat'

        elif propertyType.lower() == 'attic':

            propertyTypeOutput = 'penthouse'

        elif propertyType.lower() == 'villa':

            propertyTypeOutput = 'villa'

        elif propertyType.lower() == 'country house':

            propertyTypeOutput = 'countryHouse'

        return propertyTypeOutput

    @property
    def statusList(self):

        status_it = ['To be restructured', 'Good', 'New Construction ']

        return status_it

    def roomsCategory(self, rooms):
        """
        Parameters
        ----------
        rooms : Int
            Rooms

        Returns
        -------
        Rooms Category

        """

        roomsCat = 1

        if rooms >= 4:
            roomsCat = 4
        else:
            roomsCat = rooms

        return roomsCat

    def bathroomsCategory(self, bathrooms):
        """
        Parameters
        ----------
        bathrooms : Int
            bathRooms

        Returns
        -------
        Bathrooms Category

        """

        bathroomsCat = 1

        if bathrooms >= 2:
            bathroomsCat = 2
        else:
            bathroomsCat = bathrooms

        return bathroomsCat
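A hypothetical end-to-end call for the class above; instantiating Model already fits the estimator from the saved 'model_params' file, and the feature values here (including the district name) are placeholders rather than values from the project's dataset:

model = Model()
price = model.predict(size=85,
                      propertyType='flat',
                      district='Centro',   # placeholder district
                      status='Good',
                      rooms=3,
                      bathrooms=2)
print("Estimated price: %.0f" % price)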
Exemple #35
0
 def evalSentence(self, model, sentence):
     return self.classificator.classifyWithModel(
         model, sentence, Preprocessing(con=self.con))
Exemple #36
0
    def __init__(self):

        self.__utils = Utils()
        self.__preprocessing = Preprocessing()
        self.model_fit = self.fit()
        self.__preprocessing.execute()

def MakeParser():
    parser = argparse.ArgumentParser(
        description=('Estimate the Gibbs energy of a reaction. For example,'
                     'the following calculates dGr0 for ATP hydrolysis '
                     'at pH 6: calc_dGr0.py --ph 6 "C00002 + C00001 = '
                     'C00008 + C00009"'))
    parser.add_argument('--ph', type=float, help='pH level', default=7.0)
    parser.add_argument('--i', type=float,
                        help='ionic strength in M',
                        default=0.1)
    parser.add_argument('reaction', type=str, help='reaction in KEGG notation')
    return parser


###############################################################################
parser = MakeParser()
args = parser.parse_args()

logging.getLogger().setLevel(logging.WARNING)

print 'pH: %.1f' % args.ph
print 'I: %.1f M' % args.i
print 'Reaction: ' + args.reaction

# parse the reaction
reaction = Reaction.parse_formula(args.reaction)

p = Preprocessing()

# use the preprocessing class to calculate the estimated dG0 and uncertainty
dG0_prime, U = p.dG0_prime(reaction, pH=args.ph, I=args.i)
print u'dGr0 = %.2f \u00B1 %.2f kJ/mol' % (dG0_prime[0, 0], U[0, 0])
        output_2 = Dropout(0.1)(output_2)
        output_2 = Dense(32, activation='relu')(output_2)
        output_2 = Dropout(0.1)(output_2)
        output = Dense(outputdim, activation='softmax')(output_2)
        model = Model(inputs=[
            input_scene, input_before_sents, input_sents, input_before_char
        ],
                      outputs=output)

        return model


if __name__ == '__main__':
    embedding_dim = 62

    preprocessing = Preprocessing()
    characters, paraid2scene, paraid2chars, paraid2sents, episodeid2paraid, para_number, word_dict = preprocessing.load_dataset(
    )
    _, id2char, _, id2word, char_number, word_number, embedding_matrix = preprocessing.encoding_reduction(
        characters, word_dict)

    X, Y, X_test_2, Y_test_2 = preprocessing.generate_X_Y_split_beforechar(
        characters, paraid2scene, paraid2chars, paraid2sents, episodeid2paraid,
        para_number, word_dict)
    print('data loaded')
    X_test_1 = [
        np.array(X[0][10000:20000]),
        np.array(X[1][10000:20000]),
        np.array(X[2][10000:20000]),
        np.array(X[3][10000:20000])
    ]
Exemple #39
0
    tuples = sorted(tuples, key=lambda x: x[0])
    plt.figure(figsize=(10, 10))
    key_color = map(lambda x: 1 if x.startswith('PRI') else 0, zip(*tuples)[1])
    colors = np.asarray(['c', 'g'])
    plt.barh(bottom=pos,
             width=zip(*tuples)[0],
             color=colors[key_color],
             edgecolor=None,
             alpha=0.7)
    plt.yticks(np.arange(1, max(pos) + 0.5), zip(*tuples)[1], fontsize='small')
    plt.xlabel('Correlation', fontsize='small')
    plt.title('Ranking by point biserial correlation - Features v. Class',
              fontsize='small')
    plt.tight_layout()
    plt.savefig('../Graphs/pbc.png')


if __name__ == "__main__":

    hd = HiggsData(path=settings.get('paths', 'path_data'), imputation=True)
    df = Preprocessing.remove_missing_values(hd.processed_input, np.NaN)
    b_processed = Preprocessing.get_features(df[df.Label == -1])
    s_processed = Preprocessing.get_features(df[df.Label == 1])

    labels = hd.raw_input.ix[df.index]['Label']
    df_features = Preprocessing.get_features(df)
    cols = df_features.columns

    fishers = plot_fishers_ratio(b_processed, s_processed)
    pbc = plot_feature_class_corr_matrix(df_features, labels, cols)
def train(ink_dir, lg_dir):
    """
    This function is used for training model

    :param ink_dir:
    :param lg_dir:
    :return:
    """
    lg_files = os.listdir(lg_dir)
    pre = Preprocessing()

    feature_matrix = []
    targets = []
    c = 0
    total = len(lg_files)
    for file in lg_files:
        print(file, total - c, c)
        symbols = {}

        with open(lg_dir + "/" + file) as f:
            for line in f:
                if line.startswith("O"):
                    filt_line = line.strip().split(",")
                    symbols[filt_line[1].strip()] = [
                        filt_line[2], filt_line[4:]
                    ]

        inkml_file = file.replace(".lg", ".inkml")

        with open(ink_dir + "/" + inkml_file) as f:
            soup = bs.BeautifulSoup(f, 'html.parser')
            for key in symbols:
                label = symbols[key][0]
                strokes = symbols[key][1]
                id_list = []
                X = []
                Y = []
                for id in strokes:
                    st_id = id.strip()
                    trace = soup.findAll("trace", {'id': st_id})

                    coords = trace[0].text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))

                    X.append(x)
                    Y.append(y)
                    id_list.append(st_id)
                X, Y = pre.dopreprocess(x=X, y=Y, parser=True)
                symbols[key] = Symbol(label=label, x=X, y=Y, stroke_id=id_list)

        # relations section
        with open(lg_dir + "/" + file) as f:
            for line in f:
                if line.startswith("EO"):
                    filt_line = line.strip().split(",")
                    sym1 = symbols[filt_line[1].strip()]
                    sym2 = symbols[filt_line[2].strip()]
                    relation = filt_line[3].strip()

                    writing_slope = sym1.writing_slope(sym2)
                    writing_curve = sym1.writing_curvature(sym2)
                    bb_dist = sym1.distance_between_box(sym2)
                    distance, horizontal_offset, vertical_distance = sym1.distance_between_average_centres(
                        sym2)
                    max_point_pair = sym1.maximal_point_distance(sym2)
                    feature_matrix.append([
                        writing_slope, writing_curve, bb_dist, distance,
                        horizontal_offset, vertical_distance, max_point_pair
                    ])
                    targets.append(relation)

        c += 1

    print("Shape of Training matrix")
    print(len(feature_matrix), "x", len(feature_matrix[0]))
    print("Unique labels : ", np.unique(targets))

    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf.fit(X=feature_matrix, y=targets)
    joblib.dump(rf,
                "relation_classifier_bonus.pkl",
                protocol=pickle.HIGHEST_PROTOCOL)

    rf = joblib.load("relation_classifier_bonus.pkl")

    score = accuracy_score(y_true=targets,
                           y_pred=rf.predict(feature_matrix),
                           normalize=True)

    print("accuracy of model is :", (score * 100))
    def apply_preprocessing(self):
        data = self.get_data()
        data = Preprocessing().categorical_column_to_numerical(data)
        data = Preprocessing().normalize_numerical_columns(data)

        return data
Exemple #42
0
fmeasure_pos = []
acc_per_fold = []

f = open("stopword_tala.txt", "r")
stopwords = f.read().split()

fold = list(range(1, 11))

for i in range(len(data_train)):
    print("Fold ke " + str(i + 1))
    # print(len(data_train[i]["tweet"]))
    # print(len(data_test[i]["tweet"]))
    y_test = []
    y_pred = []
    # STOPWORD PREPARATION STAGE
    prepro = Preprocessing()
    new_cleaned_data, new_terms = prepro.preprocessing(data_train[i]["tweet"],
                                                       stopwords=stopwords)

    # TRAINING STAGE

    weight = Weighting(new_cleaned_data, new_terms)
    tfidf = weight.get_tf_idf_weighting()
    idf = weight.get_idf()

    nb = NBMultinomial()
    nb.fit(new_cleaned_data, new_terms, data_train[i]["target"], stopwords,
           idf, tfidf)

    for j in range(len(data_test[i]["tweet"])):
        print("Test ke " + str(j))
 def __init__(self):
     self.pp = Preprocessing()
Exemple #44
0
from preprocessing import Preprocessing
from model import UnetModel

data_path = '..'
"""
To train different classes
for buildings: class_dict = {1: 0}
for road:      class_dict = {3: 0}
for tracks:    class_dict = {4: 0}
"""
class_dict = {1:0}
Patch_size = 224
N_split = 15
n_classes = len(class_dict)
inp_shape = (Patch_size, Patch_size, 20)
preprocessor = Preprocessing(data_path, class_dict)
unet_model = UnetModel(inp_shape, n_classes)
unet_model.getModel(0.2)
print ("Model generated")

unet_model.compileModel()
print ("Model compiled")
epochs = 100
batch_size = 16

train_image_data_gen = preprocessor.imagePatchGenerator(batch_size)
val_image_data_gen = preprocessor.imagePatchGenerator(batch_size, val_data = True)
print("batch generators generated")

print("Training...")
trained_model = unet_model.train_generator(batch_size, 
Exemple #45
0
from keras import optimizers, losses, metrics
from preprocessing import Preprocessing
from simulation import Simulation
from spp import SpatialPyramidPooling, R2
import numpy as np
import matplotlib.pyplot as plt

# Parameters setting
num_of_cells = 2
num_of_CUEs = 2
num_of_D2Ds = (2, 3)
batch_size = 64
epochs = 10

# Get the image data format which Keras follows
image_data_format = Preprocessing.GetImageDataFormat()

# Get the input data and target data
input_data_list = [
    Preprocessing.GetInputData(num_of_cells, num_of_CUEs, i,
                               (2000, 8000, 10000), image_data_format)
    for i in num_of_D2Ds
]
target_data_list = [
    Preprocessing.GetTargetData(num_of_cells, num_of_CUEs, i,
                                (2000, 8000, 10000)) for i in num_of_D2Ds
]

# Reshape the input data
for index, input_data in enumerate(input_data_list):
    rows, cols, channels = Preprocessing.GetInputShape(input_data)
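The snippet is cut off inside the reshape loop; below is a hedged sketch of the reshape such a loop typically performs, written against NumPy only. The helper name and the assumption that input_data arrives as a flat array are mine, not the project's.

import numpy as np

def reshape_for_keras(input_data, rows, cols, channels, image_data_format):
    # Keras 'channels_first' expects (samples, channels, rows, cols);
    # 'channels_last' expects (samples, rows, cols, channels).
    if image_data_format == 'channels_first':
        return np.reshape(input_data, (-1, channels, rows, cols))
    return np.reshape(input_data, (-1, rows, cols, channels))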
Exemple #46
0
	def output_category_num_test(self):
		instance=Preprocessing(self.config)
		instance.output_category_num_scale()
 def __init__(self, path, limit):
     self.path = path
     self.limit = limit
     
     self.preprocessing = Preprocessing()
Exemple #48
0
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from gensim.matutils import argsort
from gensim import corpora, models, similarities
from gensim.topic_coherence import segmentation, probability_estimation, direct_confirmation_measure, aggregation,indirect_confirmation_measure
from pprint import pprint
from collections import namedtuple
from sklearn.decomposition import LatentDirichletAllocation
from preprocessing import Preprocessing
from sklearn.model_selection import GridSearchCV
from custom_vectorizer import OwnCountVectorizer

# nltk.download('words')

preprocessing = Preprocessing()

def prepare_data(filename):
    ''' Load and prepare the data for topic modeling. '''
    print 'Loading dataset...'
    data = pd.read_csv(filename, encoding = 'utf-8')
    data = preprocessing.remove_null(data)
    print 'Dataset loaded'
    print 'Preparing text inputs...'
    # build the text inputs from the scraped columns before preprocessing
    texts = preprocessing.preprocess_text(data['scraped_content'])
    titles = preprocessing.preprocess_text(data['scraped_title'])
    titles = titles[0:len(texts)]
    text_input = concat_text_input(titles, texts)
    text_data = data['scraped_title'] + ' ' + data['scraped_content']
    return text_input, text_data
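prepare_data feeds a topic model built with the gensim classes imported above; a hedged sketch of that next step, assuming the prepared texts are lists of tokens (the function name fit_lda and the num_topics value are illustrative):

def fit_lda(tokenized_texts, num_topics=10):
    # Map tokens to ids, build a bag-of-words corpus, and fit LDA on it.
    dictionary = Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]
    lda = LdaModel(corpus=corpus, id2word=dictionary,
                   num_topics=num_topics, passes=5)
    return lda, dictionary, corpus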
Exemple #49
0
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from preprocessing import Preprocessing
Preprocessing = Preprocessing()
from models import models
models = models()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, auc, roc_curve, normalized_mutual_info_score


data = pd.read_csv('./data.csv',index_col=0)
data = data.drop(['patkey', 'index_date', 'MATCHID'], axis=1)
data['age_at_index'] = data['age_at_index']-5
data = Preprocessing.FeatureEncoding(data)
data = Preprocessing.MissingData(data)
data.to_csv('data_complete.csv')
data = pd.read_csv('./data_complete.csv',index_col=0)
#==========================================================================================
#After using the KNN to deal with missing data, count and plot the histogram of features
'''
print(data.loc[:,'Smoking_status'].value_counts())
print(data.loc[:,'BMI_group'].value_counts())
print(data.loc[:,'Alcohol_status'].value_counts())
    
def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x()+rect.get_width()/2.-0.2, 1.03*height, '%s' % int(height))
Exemple #50
0
    def save_trails(self, encoder, alg, scr, data, path):
        """
        encoder: passed to Preprocessing.pre_encoder to obtain parameters and a preprocessor
        alg: a dict mapping an algorithm name to [estimator, param_grid], e.g. {'knn': [KNN(), knn_params]}
        scr: a dict of scorings, e.g. {'acc': ACC}
        data: a dict of datasets, {'dataname': [trainset, testset]}
        path: ['~/all_models', '~/best_models'], directories for all models and best models.
        """
        # parameters
        params = Preprocessing.pre_encoder(encoder)[0]
        preprocessor_ = Preprocessing.pre_encoder(encoder)[1]
        alg_name = list(alg)[0]
        data_name = list(data)[0]
        clsf = alg[alg_name][0]
        param = alg[alg_name][1]
        params.append(param)
        dataset = data[data_name][0]
        test_set = data[data_name][1]
        pipeline = Pipeline([('preprocessing', preprocessor_),
                             ('classifier', clsf)])
        X = dataset.drop(columns=['target'])
        y = dataset.target
        print(y.unique())
        if len(y.unique()) > 2:
            # this will transform y into 1,0 ndarray
            lb = LabelBinarizer().fit(y)
            y = lb.transform(y)

        record_scores = {}
        for i in range(len(list(scr))):
            score_name = list(scr)[i]
            score = scr[score_name]
            print("Dataset is : ", data_name.upper())
            record_scores[score_name] = {}
            score_details = {}
            X_train, X_val, y_train, y_val = train_test_split(X,
                                                              y,
                                                              test_size=0.2)
            clf = GridSearchCV(pipeline,
                               params,
                               scoring=score,
                               cv=5,
                               n_jobs=-1,
                               refit=True,
                               return_train_score=True,
                               verbose=True)
            clf.fit(X_train, y_train)
            # test dataset
            best_model = clf.best_estimator_
            X_test = test_set.drop(columns=['target'])
            y_test = test_set.target
            y_pred = best_model.predict(X_test)
            testset_score = (y_test, y_pred)
            score_details['testset_score'] = testset_score
            score_details['best_score'] = clf.best_score_
            score_details['results'] = clf.cv_results_

            # save models to path
            save_models = os.path.join(path[0], alg_name, data_name,
                                       score_name)
            save_best_model = os.path.join(path[1], alg_name, data_name,
                                           score_name)
            self.check_directory([save_best_model, save_models])
            joblib.dump(clf, os.path.join(save_models, 'all_models.pkl'))
            joblib.dump(best_model,
                        os.path.join(save_best_model, 'best_model.pkl'))
            print('All models scored in ' + score_name + ' saved in ',
                  save_models)
            print('Best model scored in ' + score_name + ' saved in ',
                  save_best_model)
            record_scores.update(score_details)
        print(record_scores)
        return record_scores
class Segmentation:

    def __init__(self):
        self.pre_processing = Preprocessing()
        self.LOWER_GRAY_2 = np.array([0, 0, 100])
        self.UPPER_GRAY_2 = np.array([255, 80, 175])

    def segment(self, image):
        mask, image_no_background = self.pre_processing.cut_out_backgound(image)
        image_hue = self.pre_processing.get_mask_brightness(image_no_background)
        #cv2.imshow("img_hue", image_no_background)
        mat_points = self.map_out(mask, image_no_background)
        image = self.pre_processing.equalize_clahe(image)
        return mat_points, image#image_no_background

    def map_out(self, img_bin, image):
        mat_points = []
        contours, inheriters = cv2.findContours(img_bin, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for c in contours:
            moments = cv2.moments(c)
            if moments['m00'] > 100:
                x = []
                y = []
                for i in c:
                    for j in i:
                        x.append(j[0])
                        y.append(j[1])
                max_x, min_x, max_y, min_y = np.argmax(x), np.argmin(x), np.argmax(y), np.argmin(y)
                mat_points.append((x[min_x], y[min_y], x[max_x], y[max_y]))
        return mat_points

    def get_points_min_max(self, array):
        x, y = cv2.split(array)
        min_x = min(x)[0]
        x = list(x)
        pos_min_x = x.index(min_x)
        min_y = y[pos_min_x][0]
        max_x = max(x)[0]
        pos_max_x = x.index(max_x)
        max_y = y[pos_max_x][0]
        return (min_x, min_y),(max_x, max_y)

    def highlight_smoke_contours(self, image):
        mask = self.pre_processing.enhance_color(self.LOWER_GRAY_2, self.UPPER_GRAY_2, image)
        contours, cany = self.pre_processing.border_image(mask, image)
        i = 0
        sub_mats = []
        for c in contours:
            extension = cv2.contourArea(c)
            if extension > 600:
                contour = contours[i]
                (x_min, y_min), (x_max, y_max) = self.get_points_min_max(contour)
                sub_mats.append((x_min, y_min, x_max, y_max))
                self.pre_processing.draw_image(image, 1, [(x_min, y_min), (x_max, y_max)], (255,0,0))
                self.pre_processing.draw_image(image, 2, contour, (0,255,0))
                self.pre_processing.draw_image(cany, 2, contour,(0,255,0))
            i = i + 1
        return image, cany, sub_mats
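A hypothetical driver for the Segmentation class above, assuming OpenCV and the Preprocessing helpers it uses are importable and that frame.jpg exists; it draws the returned bounding boxes on the equalized image:

import cv2

segmenter = Segmentation()
frame = cv2.imread("frame.jpg")
boxes, enhanced = segmenter.segment(frame)
for (x_min, y_min, x_max, y_max) in boxes:
    cv2.rectangle(enhanced, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
cv2.imwrite("segmented.jpg", enhanced)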
Exemple #52
0
def perfectly_segmented_parser(ink_dir, bonus=False):
    """
    This is a parser for perfectly segmented symbols
    :param ink_dir: inkml directory
    :param bonus: boolean for bonus
    :return:
    """
    start = time.time()

    lg_dir = ink_dir.strip().split("/")[0] + "_output_lg"

    if not os.path.exists(lg_dir):
        os.mkdir(lg_dir)

    ink_files = os.listdir(ink_dir)

    if bonus:
        print("Loaded Bonus classifier")
        clf = joblib.load("relation_classifier_bonus.pkl")
    else:
        print("Loaded relationship classifier")
        clf = joblib.load('relation_classifier4.pkl')
    pre = Preprocessing()
    total = len(ink_files)
    c = 0
    gt_c = 0

    for file in ink_files:
        print("Processing file : ", file, " Files remaining : ", total - c,
              " Files completed : ", c)

        f = open(os.path.join(ink_dir, file))

        soup = bs.BeautifulSoup(f, 'html.parser')
        trace_groups = soup.find_all('tracegroup')
        symbol_list = []

        #loop to isolate symbols
        for tracegroup in trace_groups[1:]:
            traceview = tracegroup.find_all('traceview')
            trace_id = []

            #loop to get strokes in a single symbol
            for t in traceview:
                trace_id.append(t['tracedataref'])

            gt = tracegroup.annotation.text
            gt_c += 1
            X = []
            Y = []

            #extract stroke coordinates
            for id in trace_id:
                traces = soup.findAll("trace", {'id': id})
                for trace in traces:
                    coords = trace.text.strip().split(",")
                    x = []
                    y = []
                    for coord in coords:
                        trace_parts = coord.strip().split(' ')
                        x.append(float(trace_parts[0]))
                        y.append(float(trace_parts[1]))

                    X.append(x)
                    Y.append(y)

            X, Y = pre.dopreprocess(x=X, y=Y, parser=True)
            if gt == ",":
                gt = "COMMA"
            sym_obj = Symbol(x=X, y=Y, label=gt, stroke_id=trace_id)
            symbol_list.append(sym_obj)

        symbol_count = {}

        #Run through list of symbols to get their count
        for sym in symbol_list:
            if sym.symbol not in symbol_count:
                symbol_count[sym.symbol] = 1
                sym.sym_ct = symbol_count[sym.symbol]
            else:
                symbol_count[sym.symbol] += 1
                sym.sym_ct = symbol_count[sym.symbol]

        #perform line of sight
        graph, labels = line_of_sight(symbol_list, clf)
        #run edmonds on los graph
        relations = edmonds(graph)

        #write result to lg
        write_to_lg(file=file,
                    symbol_list=symbol_list,
                    relations=relations,
                    labels=labels,
                    lg_dir=lg_dir)

        c += 1
    print("System executed in ", (time.time() - start) / 60, " minutes.")
Exemple #53
0
import metrics_reduced
from preprocessing import Preprocessing
import codecs
from collections import Counter
from constants import *

s = metrics_reduced.SpanishTools()
p = Preprocessing()

with codecs.open('corpus/Version_2_classes/corpus_2_classes.txt', 'r', 'utf-8') as file:
	features = {}
	line = file.readline()
	while line:
		line_bigram_list = s.n_grams(p.preprocessing(line.split('/|/')[1]))
		for e in line_bigram_list:
			if e in features:
				features[e] += 1
			else:
				features[e] = 1
		line = file.readline()


with codecs.open('bigram_features_filtered.txt', 'w', 'utf-8') as file:
	counter = Counter(features)
	common = counter.most_common(10000)
	temp_list=[]
	for k in common:
		for e in k[0].split():
			if e in positive_words or e in negative_words:
				temp_list.append(k[0])
Exemple #54
0
	def get_city_rank_test(self):
		instance=Preprocessing(self.config)
		instance.output_city_rank()