def classify_comment(self, comment, classifier_type='SVM', no_classes=5):
    sentiment = None
    preprocessor = Preprocessing()
    vectorizer = VectorQuantization()
    if classifier_type == 'SVM':
        preprocessed_comment = preprocessor.preprocessing(comment)
        comment_vector = vectorizer.morphosyntactic_vector(preprocessed_comment)
        sentiment = classifier_svm.classify(comment_vector)
        if no_classes == 5:
            return sentiment
        elif no_classes == 3:
            if sentiment == u'positivo' or sentiment == u'muy_positivo':
                return u'positivo'
            elif sentiment == u'negativo' or sentiment == u'muy_negativo':
                return u'negativo'
            else:
                return sentiment
    elif classifier_type == 'MNB':
        preprocessed_comment = preprocessor.preprocessing(comment)
        comment_vector = vectorizer.bigram_vector(preprocessed_comment)
        sentiment = classifier_mnb.classify(comment_vector)
        if no_classes == 5:
            return sentiment
        elif no_classes == 3:
            if sentiment == u'positivo' or sentiment == u'muy_positivo':
                return u'positivo'
            elif sentiment == u'negativo' or sentiment == u'muy_negativo':
                return u'negativo'
            else:
                return sentiment
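# A minimal refactor sketch (not from the original code base): the 5-to-3 class
# collapse above is duplicated in both classifier branches and could be factored
# into a small helper. The label strings come from the snippet; the helper name
# `collapse_sentiment` is hypothetical.
def collapse_sentiment(sentiment, no_classes=5):
    """Map a 5-class sentiment label onto 3 classes when requested."""
    if no_classes != 3:
        return sentiment
    if sentiment in (u'positivo', u'muy_positivo'):
        return u'positivo'
    if sentiment in (u'negativo', u'muy_negativo'):
        return u'negativo'
    return sentiment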
def preprocess_tweets(self):
    """ Process tweets according to mode and set arrays """
    processObject = Preprocessing(self.mode, self.tweets)
    processObject.preprocess_tweets()
    if "stem" in self.mode:
        self.stemmed_tweets_array = processObject.stemmed_tweets_array
    if "token" in self.mode:
        self.tokenized_tweets_array = processObject.tokenized_tweets_array
    if "pos" in self.mode:
        self.pos_tweets_array = processObject.pos_tweets_array
    if "lemma" in self.mode:
        self.lemmatized_tweets_array = processObject.lemmatized_tweets_array
def preprocess_tweets(self, mode, tweets_dict, filename):
    """ Process tweets according to mode and set arrays """
    processObject = Preprocessing(mode, tweets_dict, filename)
    processObject.preprocess_tweets()
    if "stem" in mode:
        self.stemmed_tweets_array = processObject.stemmed_tweets_array
    if "token" in mode:
        self.tokenized_tweets_array = processObject.tokenized_tweets_array
    if "pos" in mode:
        self.pos_tweets_array = processObject.pos_tweets_array
    if "lemma" in mode:
        self.lemmatized_tweets_array = processObject.lemmatized_tweets_array
def main(args, config):
    wDir = os.getcwd()

    # Instance Preprocessing class
    window = Preprocessing(args.fasta_file, config['win_length'], config['win_step'])
    window.output_window()
    print >> sys.stderr, "Creating windows_sequence.fasta"

    # Instance Similarity and Composition classes
    sim = Similarity(args.fasta_file, config['score_adj'], wDir)
    sim_matrix = sim.mcl_perform()
    comp_results = Composition(config['kmer_len'])
    comp_matrix = comp_results.joined()

    # Join similarity and composition matrix for PCA
    join = pd.concat([comp_matrix, sim_matrix], axis=1, join='inner')
    print >> sys.stderr, "Calculating similarity and composition matrix"

    # Instance Reduction class
    pca = Reduction(join, config['pca_comp'])
    pca_data = pca.perform_pca()
    print >> sys.stderr, "Performing PCA"

    # Instance Clustering class
    cluster = Clustering(pca_data)
    clust_obj = cluster.plot()
    print >> sys.stderr, "Performing clustering plot"

    # Instance ClusterReport class
    report = ClusterReport(clust_obj)
    file_name, querySeq = report.output_queryseq()
    print >> sys.stderr, "Doing report of clusters"

    # Instance Validate class
    valid = Validate(file_name, args.fasta_file, wDir)
    jfileComp, jfileMinus = valid.roundTwo()
    print >> sys.stderr, "Validation of results"

    # Instance ParseJplace class
    parsing = ParseJplace(jfileComp, jfileMinus)
    corrMat = parsing.correlation()
    print >> sys.stderr, "Doing profiles"

    # Instance Profiles class
    ttest = Profiles(corrMat, querySeq)
    bestWin = ttest.windowsAssigment()
    print >> sys.stderr, "Doing permutations"

    # Instance StatsBinom class
    finalResult = StatsBinom(args.fasta_file, config['win_length'], bestWin)
    finalResult.binomial()

    cleaning(file_name)
def classify(self, nl_query):
    # pos_tree = tagger.to_tree(nl_query)
    tagged_yield = tagger.tagged_labeled_yield(nl_query)
    pos_tree = []
    for i in tagged_yield:
        pos_tree.append(i['ValueAnnotation'])
    pos_tree = " ".join(pos_tree)
    _, labels, trees = Preprocessing.data()
    text_clf = Pipeline([
        ('vect', CountVectorizer(min_n=1, max_n=1)),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', LinearSVC())
    ])
    _ = text_clf.fit(trees, labels)
    predicted = text_clf.predict([pos_tree])[0]
    predicted = Preprocessing.query(predicted)
    return predicted
class Frequencies():
    """ Compute term frequencies. """

    def __init__(self, path, limit):
        self.path = path
        self.limit = limit
        self.preprocessing = Preprocessing()

    def countFreq(self):
        t0 = time()
        data = self.preprocessing.read_with_numpy(self.path, self.limit)
        print("Number of loaded Tweets: " + str(len(data)) +
              " - loaded and preprocessed in %0.3fs" % (time() - t0))
        print()
        print("Count frequencies", end=": ")
        t0 = time()

        # Use a dictionary to keep track of the frequency of tokens.
        token_counter = {}
        # Get a line/tweet from the corpus.
        for line in data:
            # Split the text on whitespace to get a list of words.
            for word in line.split():
                # Counting with a dictionary the EAFP way.
                # Try to add one to the value of a key in the dictionary.
                try:
                    token_counter[word] += 1
                # If the dictionary raises a KeyError,
                # add the key to the dictionary and set its value to 1.
                except KeyError:
                    token_counter[word] = 1

        # Sort the dictionary by frequency. items() replaces the
        # Python 2-only iteritems(), which breaks under Python 3.
        frequency_list = sorted(token_counter.items(),
                                key=lambda x: x[1],
                                reverse=True)

        # Print the 100 most frequent tokens.
        # print("Most frequent tokens (top 100):")
        # index = 1
        # for pair in frequency_list[:100]:
        #     print(str(index).ljust(2), pair[0].ljust(20), str(pair[1]))
        #     index += 1

        # Print the least frequent tokens (those occurring fewer than 11 times).
        print("\nLeast common tokens:")
        index = 1
        for pair in frequency_list[-10000:]:
            if pair[1] < 11:
                print(str(index).ljust(2), pair[0].ljust(20), str(pair[1]))
                index += 1
        print("done in %0.3fs" % (time() - t0))
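# A minimal alternative sketch (not part of the original class): the standard
# library's collections.Counter performs the same token counting and ranking with
# less code. The sample tweets below are made up for illustration.
from collections import Counter

def count_frequencies(lines):
    """Return (token, count) pairs sorted from most to least frequent."""
    counter = Counter()
    for line in lines:
        counter.update(line.split())
    return counter.most_common()

if __name__ == "__main__":
    sample = ["the cat sat", "the dog sat"]
    print(count_frequencies(sample))  # [('the', 2), ('sat', 2), ('cat', 1), ('dog', 1)]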
def generalize_sql(query_list):
    skeletons = []
    for query in query_list:
        skeleton = Preprocessing.to_skeleton(query)
        skeletons.append(skeleton)
    return skeletons
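# A hedged illustration of what a SQL "skeleton" step like Preprocessing.to_skeleton
# typically does: strip literal values so that structurally identical queries compare
# equal. This regex-based sketch is an assumption, not the project's actual
# implementation.
import re

def to_skeleton_sketch(query):
    """Replace string and numeric literals with a '?' placeholder."""
    query = re.sub(r"'[^']*'", "?", query)           # string literals
    query = re.sub(r"\b\d+(\.\d+)?\b", "?", query)   # numeric literals
    return re.sub(r"\s+", " ", query).strip()

# to_skeleton_sketch("SELECT * FROM users WHERE id = 42 AND name = 'bob'")
# -> "SELECT * FROM users WHERE id = ? AND name = ?"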
def getPreprocessed(self):
    preprocessing = Preprocessing()
    # postprocessing = Postprocessing()
    frame = self.cam.get_frame()
    pre_options = preprocessing.options
    # Apply preprocessing methods toggled in the UI
    preprocessed = preprocessing.run(frame, pre_options)
    height, width, channels = frame.shape
    # model_positions, regular_positions = self.vision.locate(frame)
    # model_positions = postprocessing.analyze(model_positions)
    # print model_positions
    # frame_h = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    # print frame[(464-128), 111]
    return preprocessed
def test(self):
    _, labels, trees = Preprocessing.data()
    # Hold out every third example for testing; train on the rest.
    test_labels = labels[2::3]
    test_trees = trees[2::3]
    labels = labels[0::3] + labels[1::3]
    trees = trees[0::3] + trees[1::3]
    text_clf = Pipeline([
        ('vect', CountVectorizer(min_n=2, max_n=3)),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MultinomialNB())
    ])
    # text_clf = Pipeline([
    #     ('vect', CountVectorizer(min_n=1, max_n=1)),
    #     ('tfidf', TfidfTransformer(use_idf=False)),
    #     ('clf', LinearSVC())
    # ])
    _ = text_clf.fit(trees, labels)
    predicted = text_clf.predict(test_trees)
    accuracy = np.mean(predicted == test_labels)
    return accuracy
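# A hedged alternative sketch: scikit-learn's train_test_split gives a shuffled
# hold-out split instead of the stride-based one above. The toy data is made up;
# only the Pipeline layout mirrors the snippet, and ngram_range plays the role of
# the min_n/max_n arguments used by older scikit-learn releases.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

trees = ["NN VB DT NN", "DT NN VB", "NN VB NN", "VB DT NN", "DT NN NN VB"] * 4
labels = ["q1", "q2", "q1", "q2", "q1"] * 4

train_x, test_x, train_y, test_y = train_test_split(
    trees, labels, test_size=0.33, random_state=42)

clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2, 3))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB()),
])
clf.fit(train_x, train_y)
print(np.mean(clf.predict(test_x) == np.array(test_y)))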
def create(self, corpus_path, model_type="morphosyntactic"):
    preprocessor = Preprocessing()
    vectorizer = VectorQuantization()
    document_list = []
    with codecs.open(corpus_path, 'r', 'utf-8') as corpus:
        line = corpus.readline()
        while line:
            comment = preprocessor.preprocessing(line.split('/|/')[1])
            category = line.split('/|/')[2].split('\n')[0]
            if model_type == "morphosyntactic":
                comment_vector = vectorizer.morphosyntactic_vector(comment)
            elif model_type == "bigram":
                comment_vector = vectorizer.bigram_vector(comment)
            else:
                print "No model defined using default: morphosyntactic"
                comment_vector = vectorizer.morphosyntactic_vector(comment)
            if comment_vector:
                document_list.append(pattern_Document(comment_vector, type=category))
            line = corpus.readline()
    model = pattern_Model(documents=document_list, weight=None)
    return model
def preprocess(self):
    """ Preprocess the suspicious and source document. """
    susp_fp = codecs.open(self.susp, 'r', 'utf-8')
    self.susp_text = susp_fp.read()
    self.susp_bow = Preprocessing.tokenize(self.susp_text, self.susp_offsets,
                                           self.susp_sents)
    Preprocessing.ss_treat(self.susp_bow, self.susp_offsets, self.min_sentlen,
                           self.rssent)
    susp_fp.close()

    src_fp = codecs.open(self.src, 'r', 'utf-8')
    self.src_text = src_fp.read()
    self.src_bow = Preprocessing.tokenize(self.src_text, self.src_offsets,
                                          self.src_sents)
    Preprocessing.ss_treat(self.src_bow, self.src_offsets, self.min_sentlen,
                           self.rssent)
    src_fp.close()
def dumps_scale_test(self):
    instance = Preprocessing(self.config)
    instance.output_dumps_scale()
def one_map(df, y_col): cols = df.columns # # ['bkg_volumes_original', 'bkg_volumes_after', 'bkg_surfaces', # 'bkg_size_z', 'bkg_size_x', 'bkg_size_y', 'bkg_max_value', # 'bkg_sum_value', 'bkg_var_value', 'a_volumes', 'a_max_value', # 'a_sum_value', 'a_var_value', 'b_volumes', 'b_max_value', 'b_sum_value', # 'b_var_value', 'c_volumes', 'c_max_value', 'c_sum_value', # 'c_var_value'] bkg_props = [[ 'bkg_volumes_after', 'bkg_surfaces', 'bkg_size_z', 'bkg_size_x', 'bkg_size_y', 'bkg_max_value', 'bkg_sum_value', 'bkg_var_value' ]] bkg_prop_combs = list(combinations(bkg_props, 2)) print(bkg_prop_combs) all_X_cols = [ ['bkg_volumes_after', 'bkg_surfaces'], ['bkg_size_z', 'bkg_size_x', 'bkg_size_y'], ['bkg_max_value', 'bkg_sum_value', 'bkg_var_value'], [ 'bkg_volumes_after', 'bkg_surfaces', 'bkg_size_z', 'bkg_size_x', 'bkg_size_y' ], [ 'bkg_size_z', 'bkg_size_x', 'bkg_size_y', 'bkg_max_value', 'bkg_sum_value', 'bkg_var_value' ], [ 'bkg_volumes_after', 'bkg_surfaces', 'bkg_size_z', 'bkg_size_x', 'bkg_size_y', 'bkg_max_value', 'bkg_sum_value', 'bkg_var_value' ], ] all_X_cols += bkg_props + bkg_prop_combs df = df[df[y_col] != "All nan"] # copy.copy() print(df[df[y_col] == "All nan"]) filter_particles = [ k for k in df.index if "particle_-1" not in k and "particle_1617" not in k ] # pd.to_numeric(df[y_col], errors='coerce').isnull().index # # # abc_props = ["bkg_var_value", "a_volumes", "a_max_value", "a_sum_value" # "a_var_value", "b_volumes", "b_max_value", "b_sum_value", # "b_var_value", "c_volumes", "c_max_value", "c_sum_value", # "c_var_value"] # filter_particles = df.eq(df.loc[:, 0], axis=0).all(1) for X_cols in all_X_cols: X = np.array(df.loc[filter_particles, X_cols].values) y = np.array(df.loc[filter_particles, y_col].values.ravel()) print("X.shape before: ", X.shape) X, y = filter_array(X, y) print("X.shape after: ", X.shape) # # normalize scaler, X_norm = normalize(X) n_particles = X_norm.shape[0] scale = 300 mkl_methods = [] # "MLKR", "LFDA" dimreduc_methods = ["tsne", "mds", "isomap"] methods = mkl_methods + dimreduc_methods for method in methods: if method in mkl_methods: model = EmbeddingSpace(embedding_method=method) model.fit(X_train=X_norm, y_train=y) X_trans = model.transform(X_val=X_norm, get_min_dist=False) if method in dimreduc_methods: print("X shape", X.shape) print("y shape", y.shape) scaler, Xy_norm = normalize(np.c_[X, y]) # model = Preprocessing(similarity_matrix=Xy_norm) if method == "tsne": X_trans, _ = model.tsne(n_components=2, perplexity=20, early_exaggeration=200, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=None) if method == "isomap": X_trans, _ = model.iso_map(n_neighbors=int(n_particles / scale), n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None) if method == "mds": X_trans, _ = model.mds(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, eps=0.001, n_jobs=None, random_state=None, dissimilarity='euclidean') save_at = summary_folder + "/{0}/{1}/comb_{2}/{3}/dens.pdf".format( y_col, method, len(X_cols), "|".join(X_cols)) print("Save at:", save_at) title = save_at.replace(ResultDir, "").replace("/", "\n") x = X_trans[:, 0] y = X_trans[:, 1] xlabel = "{0} dim 1".format(method) ylabel = "{0} dim 2".format(method) joint_plot_2(x=x, y=y, xlabel=xlabel, ylabel=ylabel, xlim=(min(x) - 0.1, max(x) + 0.1), ylim=(min(y) - 0.1, max(y) + 0.1), title=title, 
save_at=save_at) scatter_plot_4(x=x, y=y, color_array=None, xvlines=None, yhlines=None, sigma=None, mode='scatter', lbl=None, name=None, s=30, alphas=0.6, title=title, x_label='x', y_label='y', save_file=save_at.replace("dens.pdf", "scatter.pdf"), interpolate=False, color='blue', preset_ax=None, linestyle='-.', marker='o')
class Dictionary: def __init__(self): self.redis_handler = redis.Redis(db=1, decode_responses=True) self.preprocessing = Preprocessing() self.prepared_dic = dict() self.prepared_bigram = dict() self.prepared_lencat = dict() self.bigram_prefix = 'bigram_' self.lencat_prefix = 'lencat_' self.DB_DICTIONARY = 'dic_exists' self.DB_BIGRAM = 'bigram_exists' self.DB_LENCAT = 'lencat_exists' self.DB_COMMON = 'common_exists' self.CASE_UPPER = '1' self.CASE_LOWER = '2' self.CASE_BOTH = '0' def words_really_different(self, main_word, lemma_word): pattern = "^{}(es|s)?$".format(lemma_word.lower()) try: if (not re.match(r"^[a-zA-Z]+$", main_word)): return False if (re.match(pattern, main_word.lower())): return False except re.error as e: print("{}: {}".format(pattern, main_word.lower())) raise Exception(str(e)) return True def database_exists(self, keyword): return True if self.get_single_word_from_dic(keyword) == '1' else False def prepare_word2dic(self, main_word, root_word): word2store = root_word.lower() prev_case = self.prepared_dic[ word2store] if word2store in self.prepared_dic else None current_case = self.get_word_case(main_word, prev_case=prev_case) self.prepared_dic[word2store] = current_case if (self.words_really_different(main_word, root_word)): self.prepared_dic[main_word.lower()] = current_case self.prepare_lencat2dic(main_word, root_word) def prepare_lencat2dic(self, main_word, root_word): word1 = root_word.lower() word2 = main_word.lower() lencat_index = "{}{}".format(self.lencat_prefix, len(word1)) if (lencat_index in self.prepared_lencat): self.prepared_lencat[lencat_index].add(word1) else: self.prepared_lencat[lencat_index] = {word1} if (word1 != word2): lencat_index = "{}{}".format(self.lencat_prefix, len(word2)) if (lencat_index in self.prepared_lencat): self.prepared_lencat[lencat_index].add(word2) else: self.prepared_lencat[lencat_index] = {word2} def prepare_bigram2dic(self, word, prev_word): word2look = "{}{}".format(self.bigram_prefix, word[0].lower()) word_pos = word[0] if self.preprocessing.is_customized_word( word[0]) else word[1] prev_w = None if prev_word[0] == None else prev_word[0] if (prev_word[0] == None): prev_p = None elif (self.preprocessing.is_customized_word(prev_word[0])): prev_p = prev_word[0] else: prev_p = prev_word[1] if (word2look in self.prepared_bigram): self.prepared_bigram[word2look]['pos'].add(word_pos.lower()) self.prepared_bigram[word2look]['frequency'] += 1 if (prev_w != None): self.prepared_bigram[word2look]['prev_words'].add( prev_w.lower()) if (prev_p != None): self.prepared_bigram[word2look]['prev_pos'].add(prev_p.lower()) else: self.prepared_bigram[word2look] = { 'pos': {word_pos.lower()}, 'frequency': 1, 'prev_words': set() if prev_w == None else {prev_w.lower()}, 'prev_pos': set() if prev_p == None else {prev_p.lower()} } def store_prepared_data(self): result = False set_dbs = set() if (len(self.prepared_dic) > 0): if (self.redis_handler.mset(self.prepared_dic)): result = True set_dbs.add(self.DB_DICTIONARY) if (len(self.prepared_bigram) > 0): with self.redis_handler.pipeline() as pipe: for word, data in self.prepared_bigram.items(): try: pipe.set("{}_frequency".format(word), data['frequency']) if (len(data['pos']) > 0): pipe.sadd("{}_pos".format(word), *data['pos']) if (len(data['prev_words']) > 0): pipe.sadd("{}_prev_words".format(word), *data['prev_words']) if (len(data['prev_pos']) > 0): pipe.sadd("{}_prev_pos".format(word), *data['prev_pos']) except TypeError as e: print(str(e)) print("{}: {}".format(word, data)) return pipe_result = 
pipe.execute() if (not False in pipe_result): result = True set_dbs.add(self.DB_BIGRAM) if (len(self.prepared_lencat) > 0): with self.redis_handler.pipeline() as pipe: for index, words in self.prepared_lencat.items(): pipe.sadd(index, *words) pipe_result = pipe.execute() if (not False in pipe_result): result = True set_dbs.add(self.DB_LENCAT) else: print("{} => {}".format(self.DB_BIGRAM, pipe_result)) if (result): with self.redis_handler.pipeline() as pipe: for db in set_dbs: self.redis_handler.set(db, '1') pipe.execute() self.prepared_dic = dict() self.prepared_bigram = dict() self.prepared_lencat = dict() return result def add_single_word2dic(self, main_word, root_word): word2store = root_word.lower() value = self.get_single_word_from_dic(root_word) word_type = self.get_word_case(main_word, prev_case=value) with self.redis_handler.pipeline() as pipe: pipe.set(word2store, word_type) pipe.sadd("{}{}".format(self.lencat_prefix, len(word2store)), word2store) if (self.words_really_different(main_word, root_word)): pipe.set(main_word.lower(), word_type) pipe.sadd("{}{}".format(self.lencat_prefix, len(main_word)), main_word.lower()) pipe.execute() return word_type def get_single_word_from_dic(self, word2look, bigram=False, postfix=None, type_set=False): word = word2look.lower() if not bigram else "{}{}_{}".format( self.bigram_prefix, word2look.lower(), postfix) word = self.redis_handler.get( word) if not type_set else self.redis_handler.smembers(word) if (word != None and type(word) is not set): return word elif (word != None and type(word) is set and len(word) > 0): return set([term for term in word if term != None]) return None def add_single_word2bigram(self, word, prev_word): word2look = "{}{}".format(self.bigram_prefix, word[0].lower()) word_pos = word[0] if self.preprocessing.is_customized_word( word[0]) else word[1] prev_w = None if prev_word[0] == None else prev_word[0] if (prev_w == None): prev_p = None elif (self.preprocessing.is_customized_word(prev_w)): prev_p = prev_w else: prev_p = prev_word[1] with self.redis_handler.pipeline() as pipe: if (self.get_single_word_from_dic("{}_frequency".format(word2look)) != None): pipe.incr("{}_frequency".format(word2look)) else: pipe.set("{}_frequency".format(word2look), 1) pipe.sadd("{}_pos".format(word2look), *{word_pos.lower()}) if (prev_w != None): pipe.sadd("{}_prev_words".format(word2look), *{prev_w.lower()}) if (prev_p != None): pipe.sadd("{}_prev_pos".format(word2look), *{prev_p.lower()}) print(word) pipe.execute() def get_single_word_from_bigram(self, word): frequency = self.get_single_word_from_dic(word, bigram=True, postfix='frequency') if (frequency != None): return { 'pos': self.get_single_word_from_dic(word, bigram=True, postfix='pos', type_set=True), 'frequency': frequency, 'prev_words': self.get_single_word_from_dic(word, bigram=True, postfix='prev_words', type_set=True), 'prev_pos': self.get_single_word_from_dic(word, bigram=True, postfix='prev_pos', type_set=True) } return None def get_word_case(self, word, prev_case=None): if (prev_case == None): return self.CASE_UPPER if word.isupper() else self.CASE_LOWER elif (prev_case == self.CASE_BOTH): return self.CASE_BOTH else: current_case = self.get_word_case(word) return self.CASE_BOTH if current_case != prev_case else current_case def get_words_by_length(self, length_list): pipe_result = [] with self.redis_handler.pipeline() as pipe: for index in length_list: pipe.smembers("{}{}".format(self.lencat_prefix, index)) pipe_result = pipe.execute() return pipe_result def 
store_common_words(self, words): if (not self.database_exists(self.DB_COMMON)): common_words = set() for word in words: for cw in word: break common_words.add(cw) if (len(common_words) > 0 and self.redis_handler.sadd( 'common_words', *common_words)): self.redis_handler.set(self.DB_COMMON, '1')
for i in range(9):
    x_array.append(" ")
    y_array.append(" ")
    l_array.append(" ")

accuracy_total_accumulation = 0
precision_total_accumulation = 0
recall_total_accumulation = 0
fmeasure_total_accumulation = 0

for i in range(len(data_train)):
    kfold_per_combination.append(i + 1)
    y_test = []
    y_pred = []
    prepro = Preprocessing()
    cleaned_data, terms = prepro.preprocessing(data_train[i]["tweet"])
    tbrs = TermBasedRandomSampling(X=x, Y=y, L=l)
    stopwords = tbrs.create_stopwords(cleaned_data, terms)
    prepro2 = Preprocessing()
    new_cleaned_data, new_terms = prepro2.remove_stopword(cleaned_data, stopwords)
    weight = Weighting(new_cleaned_data, new_terms)
    tfidf = weight.get_tf_idf_weighting()
    idf = weight.get_idf()
    nb = NBMultinomial()
    nb.fit(new_cleaned_data, new_terms, data_train[i]["target"], stopwords, idf, tfidf)
def lstm_model_headline_body_combin(body_length, numb_epoch): fexc = Preprocessing() data = load_data() # Loading train data from files data.set_path(path='fnc-1-master') train_stance_data = data.get_headline_body_stance() train_bodies_data = data.get_body_id_text() train_headlines, train_bodies, train_stances = data.get_mapped_id_body( train_stance_data, train_bodies_data) # Removing punctuation and stop words from the headline and body of train data train_headlines_cl = fexc.get_clean_data(train_headlines) train_bodies_cl = fexc.get_clean_data(train_bodies) train_stances_cl = fexc.get_clean_data(train_stances) # Convert labels to integer train_stances_in = fexc.convert_lable_int(train_stances_cl) # Load the test data data.set_name("test") test_stance_data = data.get_headline_body_stance() test_bodies_data = data.get_body_id_text() test_headlines, test_bodies = data.get_mapped_id_body(test_stance_data, test_bodies_data, data_type="test") # Removing punctuation and stop words from the headline and body of test data test_headlines_cl = fexc.get_clean_data(test_headlines) test_bodies_cl = fexc.get_clean_data(test_bodies) # Remove Stop words # test_headlines_cl = fexc.remove_stop_words_list(test_headlines_cl) test_bodies_cl = fexc.remove_stop_words_list(test_bodies_cl) # Set the tokenizer alltext = train_headlines_cl + train_bodies_cl + test_headlines_cl + test_bodies_cl token = Tokenizer(num_words=30000) token.fit_on_texts(alltext) print('Number of Unique words: ' + str(len(token.word_index.keys()))) # Combine the headline and bodies of training data train_data = fexc.combine_heading_body(train_headlines_cl, train_bodies_cl) word_index = token.word_index # Converting train data to sequence train_data = token.texts_to_sequences(train_data) # Padding train data train_data = pad_sequences(train_data, maxlen=(MAX_HEADLINE_LENGTH + int(body_length))) # Converting the labels to one hot encoder onehotencoder = OneHotEncoder() train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray() # Splitting the data in train and validation train_data, val_data, train_stances_final, stances_val = \ train_test_split(train_data, train_stances_in, test_size=0.2, random_state=42) # Combining test data test_data = fexc.combine_heading_body(test_headlines_cl, test_bodies_cl) # Converting test data to sequence test_data = token.texts_to_sequences(test_data) # Padding test data test_data = pad_sequences(test_data, maxlen=MAX_HEADLINE_LENGTH + int(body_length)) # Getting embedding index embeddings_index = models.get_embeddings_index(GLOVE_DIR) print('Found %s word vectors.' 
% len(embeddings_index)) # Getting embedding matrix embedding_matrix = models.get_embedding_matrix( embedding_dim=EMBEDDING_DIM, embeddings_index=embeddings_index, word_index=word_index) # Building the Model fake_nn = models.lstm_with_combine_headline_body( headline_length=MAX_HEADLINE_LENGTH, body_length=int(body_length), embedding_dim=EMBEDDING_DIM, word_index=word_index, embedding_matrix=embedding_matrix, activation='relu', drop_out=0.5, numb_layers=300, cells=200) # Early stopping and model checkpoint early_stopping = EarlyStopping(monitor='val_loss', patience=10) bst_model_path = 'Fake_news_nlp.h5' model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True) # Fitting the model fake_hist = fake_nn.fit(train_data, train_stances_final, batch_size=128, epochs=int(numb_epoch), shuffle=True, validation_data=(val_data, stances_val), callbacks=[early_stopping, model_checkpoint]) # Storing the training and validation accuracy and loss in file for plot lstm_data = [] with open( os.path.join( OBJECT_DUMP, "lstm_headline_body_combine" + str(body_length) + ".txt"), 'wb') as bow_hist: lstm_data.append(fake_hist.history['acc']) lstm_data.append(fake_hist.history['val_acc']) lstm_data.append(fake_hist.history['loss']) lstm_data.append(fake_hist.history['val_loss']) pickle.dump(lstm_data, bow_hist) # Predict the labels for test data result = fake_nn.predict([test_data], batch_size=128) # Store the results in the result file result_str = fexc.convert_lable_string(result) with io.open(TEST_FILE, mode='r', encoding='utf8') as read_file: test_stance = csv.DictReader(read_file) with io.open(RESULT_FILE + "_" + str(body_length) + ".csv", mode='w', encoding='utf8') as write_file: writer = csv.DictWriter( write_file, fieldnames=['Headline', 'Body ID', 'Stance']) writer.writeheader() for sample, prediction in zip(test_stance, result_str): writer.writerow({ 'Body ID': sample['Body ID'], 'Headline': sample['Headline'], 'Stance': prediction }) # Print the Accuracy, competition score and confusion matrix print_result("fnc-1-master/competition_test_stances.csv", RESULT_FILE + "_" + str(body_length) + ".csv")
def preprocessing(self, doPreprocessing, doFeatureSelection, take_feature, threshold, progress, qc, label=None): features = None if self.con != None: if self.training_table: if label != None: label.setText("Getting data training ...") self.dataTraining = self.con.getDataAsDF(self.training_table) progress.setValue(10) if self.dataTraining is not None: p = Preprocessing(con=self.con) oritext = None uniqFeature = [] features = {} originalFeatureCount = 0 progressP = 10 progressS = (70 - progressP) / len(self.dataTraining.index) if label != None: label.setText("Preprocessing data training ...") for index, row in self.dataTraining.iterrows(): text = row[self.text_col] if doPreprocessing: pretext = p.process(text) oritext = pretext['oritext'] pretext = pretext['stemmed_text'] else: pretext = p.processNoPre(text) t = p.processNoPre(pretext).split( " ") # bad performance uniqFeature.extend(t) # bad performance # print("Ori : ",text) # print("Preprocessed : ",pretext," -> ",row[self.class_col]) self.dataTraining.at[index, self.text_col] = pretext progressP += progressS progress.setValue(progressP) # time.sleep(0.5) qc.processEvents() progress.setValue(70) qc.processEvents() uniqFeature = set(uniqFeature) # bad performance qc.processEvents() features['featurebefore'] = len( uniqFeature) # bad performance qc.processEvents() progress.setValue(80) features['vsm'] = self.builtVSM(doFeatureSelection, take_feature, threshold, qc=qc, label=label) features['oritext'] = oritext progress.setValue(90) else: print("No training table!") progress.setValue(100) return features
        proba(term | topic) = beta[topic][term]

        We shall, for each topic, find the top 20 words that contribute to a
        document being classified as said topic.
        """
        # Rank terms within each topic by beta; a rank below 20 means the term is
        # among the topic's 20 most probable words (the second argsort turns the
        # sorted positions into per-term ranks).
        ranks_per_topic = np.argsort(np.argsort(self.beta * (-1), axis=1), axis=1)
        for i in range(self.nb_topics):
            for j in range(self.nb_terms):
                if ranks_per_topic[i][j] < 20:
                    print(self.index[j], end=" ")
            print()


if __name__ == "__main__":
    """ Example of application using newsgroups """
    from sklearn.datasets import fetch_20newsgroups

    train = fetch_20newsgroups(subset='train',
                               remove=('headers', 'footers', 'quotes'))
    pp = Preprocessing()
    index, bow = pp.build_bow(pp.corpus_preproc(train["data"]))
    lda = LDA(5, bow, index, alpha=0.1, set_alpha=True)
    lda.estimation(max_iter_em=100, max_iter_var=10)
    lda.display_word_topic_association()
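# A minimal standalone illustration (made-up beta matrix) of the top-k idiom used
# above: a single argsort over -beta yields term indices ordered by probability, and
# slicing the first k columns selects each topic's top words directly.
import numpy as np

beta = np.array([[0.1, 0.5, 0.2, 0.2],
                 [0.4, 0.1, 0.4, 0.1]])
index = ["apple", "ball", "cat", "dog"]

top_k = np.argsort(-beta, axis=1)[:, :2]   # indices of the 2 best terms per topic
for topic_terms in top_k:
    print(" ".join(index[t] for t in topic_terms))
# topic 0 -> "ball cat", topic 1 -> "apple cat" (ties broken by term index)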
def load_location_json_test(self):
    instance = Preprocessing(self.config)
    # instance.load_location_json('UserInfo_2')
    instance.output_coor_scale()
# If '-specificity' flag is not set, the default is in str format; if the flag is set, it's in list format
specificity = None
if type(args.specificity) == str:
    specificity = args.specificity
else:
    specificity = args.specificity[0]

# The '-result' flag cannot be more specific than the '-specificity' flag,
# i.e. if you are searching by paragraph, how can you print sentences?
if resultDir[args.results[0]] < specificityDir[specificity]:
    sys.exit('Search results specificity cannot be broader than search algorithm '
             'specificity, program terminating')

# For each file in the folder, read and tokenize text according to the '-result' flag
fileTokenTouplesResults = []
for fileName in os.listdir(searchDocPath):
    fileObj = Preprocessing(resultDir[args.results[0]], fileName)
    fileObj.readFile()
    fileObj.tokenizeText()
    # Append text and file info as a tuple into the list
    fileTokenTouplesResults.append((fileObj.tokenizedList, fileObj.fileName))

equalBool = 0
fileTokenTouplesSpecificity = []
# If the '-specificity' flag is the same as the '-result' flag, reuse the computed tuple list from '-result'
if resultDir[args.results[0]] == specificityDir[specificity]:
    fileTokenTouplesSpecificity = fileTokenTouplesResults
    equalBool = 1
# If they are different, read and tokenize text according to the '-specificity' flag
else:
    for fileName in os.listdir(searchDocPath):
        fileObj = Preprocessing(specificityDir[specificity], fileName)
import networkx as nx
import matplotlib.pyplot as plt
from community import community_louvain
from preprocessing import Preprocessing

preprocessingClass = Preprocessing("lastfm_similars.db")
nodes = preprocessingClass.get_all_nodes()
indexes = preprocessingClass.giving_indexes_to_tIds(nodes)
allData = preprocessingClass.getting_all_data(indexes)

trackIds = []
inFile = open('track_list.txt')
for line in inFile:
    fields = line.strip().split('<S>')
    trackIds.append(fields[0])
inFile.close()

G = nx.Graph()
print("getting the graph start")
for node in allData:
    if allData[node]["tid"] in trackIds:
        for i in range(len(allData[node]["similars"])):
            G.add_edge(node, allData[node]["similars"][i],
                       weight=allData[node]["weights"][i])

# nx.draw(G, pos=nx.circular_layout(G), node_color='r', edge_color='b')
# plt.show()
from preprocessing import Preprocessing
from segmentaion import Segmentation
from matplotlib import pyplot as plt
from skimage import io
import numpy as np
from skimage.feature import greycomatrix, greycoprops
from skimage import data
from skimage.color import rgb2gray

# You can make a loop to handle all images at once.
preprocessing = Preprocessing()
preprocessing.preproces('ImageFile')
# preprocessing.preproces('C:/Users/Teja/Desktop/internship/project/braintumor/Cl/defect.jpg')
preprocessing.binarization()
preprocessing.removingSkul()
preprocessing.enhanceImage()
preprocessing.segmentation()
image = preprocessing.getInfectedRegion()

# ### Extract GLCM Texture Features

im = io.imread(
    'C:/Users/Teja/Desktop/internship/project/braintumor/tmp/tumourImage.jpg')

# GLCM Texture Features
ds = []
cr = []
cn = []
am = []
def main(): # create a logger logging = log.Logging(user_messages=False, timer_messages=True) # filter configurations filter_conf = fc.FilterConfiguration(logging) filter_conf.import_filters() # preprocessing configurations pp_config = pc.PreprocessingConfiguration(logging) pp_config.config() # save configurations save_config = sc.SaveConfiguration(logging) save_config.config() # tweepy if (filter_conf.tweepy): # Twitter connection twitter_conn = tt.TwitterAuthenticator(logging) twitter_conn.connect() # Twitter query tt_query_tweepy = qt.QueryTweets(twitter_conn, filter_conf.list_filters, True, logging) tt_query_tweepy.query_manager() # Twitter preprocessing preprocessing = pp.Preprocessing(pp_config, tt_query_tweepy.dict_df_posts, logging) preprocessing.preprocessing() # Twitter save save = sv.Save(save_config, tt_query_tweepy.dict_df_posts, logging) save.save() # twint if (filter_conf.twint): # Twitter query tt_query_twint = qt2.QueryTweetsV2(filter_conf.list_filters, True, logging) tt_query_twint.query_manager() # Twitter preprocessing preprocessing = pp.Preprocessing(pp_config, tt_query_twint.dict_df_posts, logging) preprocessing.preprocessing() # Twitter save save = sv.Save(save_config, tt_query_twint.dict_df_posts, logging) save.save() # praw if (filter_conf.praw): # Reddit connection reddit_conn = rd.RedditAuthenticator(logging) reddit_conn.connect() # Reddit query rt_query_praw = qr.QueryRedditPosts(reddit_conn, filter_conf.list_filters, True, logging) rt_query_praw.query_manager() # Reddit preprocessing preprocessing = pp.Preprocessing(pp_config, rt_query_praw.dict_df_posts, logging) preprocessing.preprocessing() # Reddit save save = sv.Save(save_config, rt_query_praw.dict_df_posts, logging) save.save() # pmaw if (filter_conf.pmaw): # Reddit query rt_query_pmaw = qr2.QueryRedditPostsV2(filter_conf.list_filters, True, logging) rt_query_pmaw.query_manager() # Reddit preprocessing preprocessing = pp.Preprocessing(pp_config, rt_query_pmaw.dict_df_posts, logging) preprocessing.preprocessing() # Reddit save save = sv.Save(save_config, rt_query_pmaw.dict_df_posts, logging) save.save()
import pandas as pd
import numpy as np
from knn import KNN
from classifier import Classifier
import matplotlib.pyplot as plt
import visualization
from sklearn.model_selection import train_test_split
from preprocessing import Dataset, Preprocessing
from decisiontree import DecisionTree

# -----------------------------------------------------------------------------
# 0. Preprocessing

# get preprocessed and cleaned dataframes
cancer_df, hepatitis_df = Preprocessing.get_preprocessed_datasets()
cancer_features, cancer_labels = Preprocessing.get_labels_features(cancer_df)
hepatitis_features, hepatitis_labels = Preprocessing.get_labels_features(hepatitis_df)

# Dataset 1 (Breast Cancer)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(cancer_features,
                                                            cancer_labels,
                                                            test_size=0.33)

# Dataset 2 (Hepatitis)
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(hepatitis_features,
                                                            hepatitis_labels,
                                                            test_size=0.33)

# -----------------------------------------------------------------------------
# 1. Compare the accuracy of KNN and Decision Tree algorithm on the two datasets.
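# A hedged sketch of step 1: the project's own KNN and DecisionTree classes are not
# shown here, so this comparison loop uses scikit-learn's equivalents purely to
# illustrate evaluating both models on the splits created above.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

splits = {
    "breast cancer": (X_train_c, X_test_c, y_train_c, y_test_c),
    "hepatitis": (X_train_h, X_test_h, y_train_h, y_test_h),
}
for name, (X_tr, X_te, y_tr, y_te) in splits.items():
    for model in (KNeighborsClassifier(n_neighbors=5), DecisionTreeClassifier()):
        model.fit(X_tr, y_tr)
        acc = accuracy_score(y_te, model.predict(X_te))
        print("%s - %s: %.3f" % (name, type(model).__name__, acc))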
class Geo(): """ Loads data, loads preprocessing and does country mapping. """ def __init__(self): self.pp = Preprocessing() def getGeoTweets(self, path, limit): data = self.loadGeoData(path, limit) country_map = self.country_mapping(data) return data, country_map def loadGeoData(self, path, limit=None): counter = 0 print("Loading data from " + path, end=" - ") t0 = time() data = numpy.loadtxt(path, dtype='str', delimiter="\t", usecols = [2,5,3,3,0], comments=None) print("done in %0.3fs" % (time() - t0)) print("Preprocessing tweet texts", end=" - ") t0 = time() empty_row = [] for i in range(len(data)): counter+=1 tweet = self.pp.preprocess_tweet(data[i][2]) data[i][2] = tweet if tweet == "": empty_row.append(i) if limit and counter >= limit: if len(empty_row) > 1: empty_row.sort(reverse=True) for id in empty_row: data = numpy.delete(data, (id), axis=0) print("done in %0.3fs" % (time() - t0)) print() return data[:limit] print("done in %0.3fs" % (time() - t0)) print() if len(empty_row) > 1: empty_row.sort(reverse=True) for id in empty_row: data = numpy.delete(data, (id), axis=0) return data def country_mapping(self, data): """ Mapping: country (key) - corresponding tweet ids (value list). """ country_map = {} for i in range(len(data)): country = data[i][1] if country in country_map.keys(): tmp = country_map[country] tmp.append(i) country_map[country] = tmp else: country_map[country] = [i] return country_map
def __init__(self):
    self.pre_processing = Preprocessing()
    self.LOWER_GRAY_2 = np.array([0, 0, 100])
    self.UPPER_GRAY_2 = np.array([255, 80, 175])
def preprocessing(file, originalSamplingRate=48000, originalChannels=2,
                  sampleRate=48000, channels=2, alpha=0.97, frameLength=20,
                  windowType="hanning"):
    prep = Preprocessing()
    prep.readSoundFile(file, sr=originalSamplingRate, n_channels=originalChannels)
    prep.channelConversion(channels)
    prep.resampling(sampleRate)
    prep.pre_emphasis(alpha)
    prep.framing(frameLength)
    prep.addWindow(windowType)
    prep.vad()
    return prep.frames
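# For reference, a minimal NumPy sketch of the pre-emphasis step used in the
# pipeline above (y[n] = x[n] - alpha * x[n-1]). This is the standard textbook
# definition, not the project's Preprocessing.pre_emphasis implementation.
import numpy as np

def pre_emphasis(signal, alpha=0.97):
    """Boost high frequencies by subtracting a scaled copy of the previous sample."""
    return np.append(signal[0], signal[1:] - alpha * signal[:-1])

# pre_emphasis(np.array([1.0, 1.0, 1.0])) -> array([1.  , 0.03, 0.03])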
def get_location_test(self):
    instance = Preprocessing(self.config)
    instance.get_location()
dataset = VideoLoader(
    args.csv,
    framerate=1 if args.type == '2d' else 24,
    size=224 if args.type == '2d' else 112,
    centercrop=(args.type == '3d'),
)
n_dataset = len(dataset)
sampler = RandomSequenceSampler(n_dataset, 10)
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    num_workers=args.num_decoding_thread,
    sampler=sampler if n_dataset > 10 else None,
)
preprocess = Preprocessing(args.type)
model = get_model(args)
feat_root = args.feat_root

with th.no_grad():
    for k, data in enumerate(loader):
        input_file = data['input'][0]
        output_file = data['output'][0]
        output_path = os.path.join(feat_root, output_file)
        if len(data['video'].shape) > 3:
            print('Computing features of video {}/{}: {}'.format(
                k + 1, n_dataset, input_file))
            video = data['video'].squeeze()
            if len(video.shape) == 4:
                video = preprocess(video)
                n_chunk = len(video)
                features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0)
from preprocessing import Preprocessing
from language import Language
from image import Image_modality
import util
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from keras.models import Model, Input
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    pre = Preprocessing()
    df_input = pre.data
    train_data = pre.train_data
    test_data = pre.test_data
    Y_train = pre.Y_train
    Y_test = pre.Y_test

    lng = Language(df_input)
    print("Preparing the language data")
    train_tokens = train_data['title'].apply(util.get_tokens)
    lng_data_train = lng.get_encoded_data(train_tokens)
    test_tokens = test_data['title'].apply(util.get_tokens)
    lng_data_test = lng.get_encoded_data(test_tokens)

    language_model = lng.lng_model
    print("training the language model (bi-lstm), this might take some time")
    language_model.fit(lng_data_train, Y_train, verbose=1,
                       validation_split=0.2, nb_epoch=5)
    ## printing precision_recall - language modality
def output_missing_scale_test(self):
    instance = Preprocessing(self.config)
    instance.output_missing_scale()
def processFrameOf(self, camera): if not camera.isUp(): #self.log("WARN", "Video stream for Camera: " + camera._id + " not available") return False maxt = 10 frame = None for i in range(1, maxt + 1): #self.log("INFO", "Trying to accesss frame {}/{}".format(i, maxt)) try: ret, f = camera.read() if ret: frame = f except: yyyyy = 1 if frame is None: #self.log("WARN", "Couldn't access a valid frame") return False if camera._id in self.preprocessings: self.log("INFO", "Pre-Processing frame of camera: " + camera._id) st = time.time() lineCoords = [(5, frame.shape[0] - 30 * (i + 1)) for i in range(3)] pp = self.preprocessings[camera._id] if 'brightness' in pp: bv = pp['brightness'] frame = Preprocessing.adjustBrightness(frame, bv) Preprocessing.putText(frame, "Brightness: " + str(bv), lineCoords[0]) if 'sharpness' in pp: sv = pp['sharpness'] frame = Preprocessing.sharpenImage(frame, k=sv) Preprocessing.putText(frame, "Sharpness: " + str(sv), lineCoords[1]) if 'denoise' in pp: dv = pp['denoise'] if dv > 0: frame = Preprocessing.denoiseImage(frame, strength=dv) Preprocessing.putText(frame, "denoise: " + str(dv), lineCoords[2]) et = time.time() self.log("TIME", "Action took {:2.6f}s".format((et - st))) #person detection #plt.imshow(frame) self.log("INFO", "Detecting People in the frame") bboxes, conf = self.pd.detect(frame, drawOnFrame=False) #overlapping bounding boxes self.log("INFO", "Applying nms") bboxes = non_max_suppression(np.array(bboxes), probs=None, overlapThresh=0.65) #tracking if len(bboxes) > 0: tbboxes, tids = camera.tk.track(frame, bboxes, conf, drawOnFrame=False) if len(tbboxes) > 0: self.log("INFO", "Tracking people {}".format(len(tids))) for i in range(len(tbboxes)): tbbox = np.array(tbboxes[i], np.int32) tid = tids[i] #increasing fps by selective recognition if camera.track.hasPerson(tid): if camera.track.people[tid].isSuspect(): if time.time() - camera.track.people[ tid].whenRecognized < self.recognizeThresh: continue person = frame[tbbox[1]:tbbox[3], tbbox[0]:tbbox[2]] #cv2.imshow("person: ", person) faces = fdr.extractFaces(person, drawOnFrame=False) if len(faces) <= 0: continue face = faces[0] fe = fdr.getEmbedding(face[0]) #check if he/she is a suspect suspectDetected = False for k, suspect in self.suspects.items(): #{"face":face, "em":em, "path":path} for pic in suspect.pictures: em = pic['em'] if fdr.is_match(em, fe): camera.track.suspectDetected( tid, suspect, time.time(), frame, self.SERVER_ID, camera._id) suspectDetected = True break if suspectDetected: break #update track camera.track.updatePositions(tbboxes, tids) camera.track.clearForgotten() #display bboxes and everything camera.track.draw(frame) #udpate the processedFrame #cv2.imshow("Frame", frame) t = time.localtime() text = "Server: " + time.strftime("%H:%M:%S", t) cv2.putText(frame, text, (10, 60), cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 255, 255), 1) with self.lock: camera.processedFrame = frame camera.processedFrameTime = time.time() self.xo = 1 return True
class Summarization(object): def __init__(self, lang_code, method="LSA", n_words=200, k=1, sv_threshold=0.5, min_df=0, max_df=.1, use_idf=True): self.lang_code = lang_code self.method = method self.n_words = n_words self.k = k # num topics self.sv_threshold = sv_threshold self.min_df = min_df self.max_df = max_df self.use_idf = use_idf self.valid_langs = ["en"] if self.lang_code in self.valid_langs: self.p = Preprocessing(lang_code=lang_code) self.tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df, use_idf=use_idf) def generate_doc_term_matrix(self, norm_sents): """ Generate document term matrix from normalized sentences """ dt_matrix = self.tfidf.fit_transform(norm_sents) dt_matrix = dt_matrix.toarray() return dt_matrix def generate_term_doc_matrix(self, dt_matrix): """ Generate term document matrix from document term matrix """ td_matrix = dt_matrix.T return td_matrix def generate_summary(self, sents, top_sentence_indices): """ Generate summary from original sentences using top sentence indices """ sents = np.array(sents) summary = "\n".join(sents[top_sentence_indices]) return summary def summarize(self, text, n_sents=3): """ Summarize a given text and get top sentences """ try: prediction = dict() if text: if self.lang_code in self.valid_langs: if Utility.get_doc_length(text) > self.n_words: # generate sentences, normalized sentences from text sents, norm_sents = self.p.text_preprocessing(text) # generate doc-term-matrix, term-doc-matrix dt_matrix = self.generate_doc_term_matrix(norm_sents) td_matrix = self.generate_term_doc_matrix(dt_matrix) if self.method == "LSA": lsa = LSA(self.k, td_matrix) term_topic_matrix, singular_values, topic_doc_matrix = lsa.u, lsa.s, lsa.vt # remove singular values below given treshold singular_values = lsa.filter_singular_values( singular_values, self.sv_threshold) # get salience scores from top singular values & topic document matrix salience_scores = lsa.get_salience_scores( singular_values, topic_doc_matrix) # get the top sentence indices for summarization top_sentence_indices = lsa.get_top_sent_indices( salience_scores, n_sents) summary = self.generate_summary( sents, top_sentence_indices) elif self.method == "TEXT_RANK": tr = TextRank(dt_matrix, td_matrix) # build similarity graph similarity_matrix = tr.similiarity_matrix similarity_graph = tr.get_similarity_graph( similarity_matrix) # compute pagerank scores for all sentences ranked_sents = tr.rank_sentences(similarity_graph) # get the top sentence indices for summarization top_sentence_indices = tr.get_top_sentence_indices( ranked_sents, n_sents) summary = self.generate_summary( sents, top_sentence_indices) else: return "no method found" # apply cleaning for readability summary = Utility.remove_multiple_whitespaces(summary) summary = Utility.remove_trailing_whitespaces(summary) prediction["summary"] = summary prediction["message"] = "successful" else: return "required at least {} words".format( self.n_words) else: return "language not supported".format() else: return "required textual content" return prediction except Exception: logging.error("exception occured", exc_info=True)
class Model(): """ Model Class """ def __init__(self): self.__utils = Utils() self.__preprocessing = Preprocessing() self.model_fit = self.fit() self.__preprocessing.execute() @property def dataset_preprocessed(self): """ Returns ------- DataFrame Houses Preprocessed """ df = pd.read_csv("data/houses_clean.csv") df.drop('Unnamed: 0', axis=1, inplace=True) return df def get_prepared_df(self): """ Prepare dataframe for modelling Parameters ---------- df : Dataframe Data Returns ------- Array to Model """ df = self.dataset_preprocessed df['size'] = df['size'].apply(lambda x: math.ceil(x / 5.0) * 5.0) #Property Type Union df['propertyType'] = np.where((df['propertyType'] == 'studio') | (df['propertyType'] == 'duplex'), 'flat', df['propertyType']) #Select Features df = df[[ 'price', 'size', 'propertyType', 'district', 'status', 'roomsCat', 'bathroomsCat' ]] #,'box_posto_auto','hasTerrace', #'hasGarden','hasSwimmingPool']] return df @property def labels_dataset(self): """ Returns ------- Labels Dataset Numpy Array """ df = self.get_prepared_df() labels = np.array(df['price']) return labels @property def features_dataset(self): """ Returns ------- Features Dataset Numpy Array """ df = self.get_prepared_df() features = df.drop('price', axis=1) features = np.array(features) return features @property def feat_tsf_dataset(self): """ Returns ------- Features Dataset Numpy Array with Category Encoders """ features = self.features_dataset labels = self.labels_dataset #Encoder encoder = ce.GLMMEncoder(cols=self.cat_index) #Encoder Cv cv_encoder = NestedCVWrapper(feature_encoder=encoder, cv=5, shuffle=True, random_state=7) #Apply Transform to all datasets feat_tsf = cv_encoder.fit_transform(features, labels) return feat_tsf @property def features_list(self): """ Returns ------- Features List """ df = self.get_prepared_df() features = df.drop('price', axis=1) # Saving feature names for later use feature_list = list(features.columns) return feature_list @property def n_features(self): """ Returns ------- Number of features """ return len(self.features_list) @property def cat_index(self): """ Returns ------- Index position categorical columns """ df = self.get_prepared_df() df.drop('price', axis=1, inplace=True) categorical_features_indices = np.where((df.dtypes != np.int) & (df.dtypes != np.float))[0] index = categorical_features_indices.reshape(1, -1).tolist()[0] return index def search_best_rf(self, n_trees=2500, saveStats=True): """ Seach Best Random Forest Model Parameters ---------- df : DataFrame prepared (method prepared_data) Returns ------- JSON File (model_params_rf.json). 
""" #Process Time start = time.time() #Datasets feat_tsf = self.feat_tsf_dataset labels = self.labels_dataset #Generate random state #min_samples_split_values to test max_features_list = np.arange(0.20, 0.66, 0.01).tolist() max_features_list = [round(elem, 2) for elem in max_features_list] max_features_list.append('sqrt') max_features_list.append('auto') #Get max n_trees max_n_trees = self.depth_of_trees.max()[0] max_depth_list = np.arange(int(max_n_trees / 4), max_n_trees, 1).tolist() max_depth_list.append(None) #min_impurity_decrease min_impurity_decrease_list = np.arange(0.01, 0.26, 0.01).tolist() min_impurity_decrease_list = [ round(elem, 2) for elem in min_impurity_decrease_list ] #min_samples_leaf_list.append(None) param_grid = { "max_features": max_features_list, "max_depth": max_depth_list, "min_impurity_decrease": min_impurity_decrease_list } #RF Model to test rf = RandomForestRegressor(bootstrap=True, oob_score=True, n_estimators=n_trees, random_state=7) #Define and execute pipe grid_cv = HalvingRandomSearchCV(estimator=rf, param_distributions=param_grid, random_state=7, max_resources='auto', verbose=3).fit(feat_tsf, labels) df_results = pd.DataFrame(grid_cv.cv_results_) #Save CV Results if saveStats: df_results.to_csv('data/cv_hyperparams_model.csv') print("Best Params:") print(grid_cv.best_params_) print("Saving model in 'model_params.joblib'") # Writing joblibfile with best model dump(grid_cv.best_estimator_, 'model_params.joblib') #Save json file with params best model json_txt = json.dumps(grid_cv.best_params_, indent=4) with open('model_params', 'w') as file: file.write(json_txt) #End Time end = time.time() time_elapsed = round((end - start) / 60, 1) return ('Time elapsed minutes: %1.f' % (time_elapsed)) def fit(self): """ Returns ------- Fit Best Params Model """ #Datasets feat_tsf = self.feat_tsf_dataset labels = self.labels_dataset #Open params with open('model_params', 'r') as file: params_model = json.load(file) #Model rf = RandomForestRegressor(**params_model) #Fit & Metrics rf.fit(feat_tsf, labels) oob_score = (rf.oob_score_) * 100 print("OOB Score: %.2f" % oob_score) return rf @property def oob_score(self): """ Returns ------- Best Model OOB Score """ return self.model_fit.oob_score_ @property def params(self): """ Returns ------- Best Model Params """ return self.model_fit.get_params() def predict( self, size, propertyType, district, status, rooms, bathrooms, #box_posto_auto, #hasGarden, #hasTerrace, #hasSwimmingPool ): """ Parameters ---------- district : str (category) status : str (category) rooms : int bathrooms : int box_posto_auto : Bool(1,0) garden : Bool(1,0) terrace : Bool(1,0) hasSwimmingPool : Bool(1,0) Returns ------- Prediction : Best Model Prediction """ """ #Avg Price Zone avg_price_zone_df = self.dataset_preprocessed[['district','avgPriceZone']] avg_price_zone_df = avg_price_zone_df.drop_duplicates() avgPriceZone = avg_price_zone_df.loc[ avg_price_zone_df['district']==district]['avgPriceZone'].values[0] """ #Rooms Category roomsCat = self.roomsCategory(rooms) #Bathrooms Logic bathroomsCat = self.bathroomsCategory(bathrooms) #Array for prediction array = np.array([ size, propertyType, district, status, roomsCat, bathroomsCat, #box_posto_auto, #hasGarden, #hasTerrace, #hasSwimmingPool ]).reshape(1, -1) #Encoder encoder = ce.GLMMEncoder(cols=self.cat_index) #Encoder CV KFold cv_encoder = NestedCVWrapper(encoder, cv=5, shuffle=True, random_state=7) #Datasets features = self.features_dataset labels = self.labels_dataset #Apply Transform to all datasets 
feat_tsf = cv_encoder.fit_transform(features, labels, array) #Prediction prediction = self.model_fit.predict(feat_tsf[1])[0] return prediction @property def permutation_importance(self): """ Permutation Features Importance Returns ------- Graph Permutation Importance """ #Datasets feat_tsf = self.feat_tsf_dataset labels = self.labels_dataset rf = load('model_params.joblib') #Fit rf.fit(feat_tsf, labels) #Permutation importance result = permutation_importance(rf, feat_tsf, labels, n_repeats=10, random_state=7, n_jobs=2) df = (pd.DataFrame({ "ft": self.features_list, 'imp_mean': result.importances_mean, 'imp_dsvt': result.importances_std })) df.sort_values(by='imp_mean', ascending=False, inplace=True) sorted_idx = result.importances_mean.argsort() fig, ax = plt.subplots() ax.boxplot(result.importances[sorted_idx].T, vert=False, labels=self.get_prepared_df().iloc[:, 1:].columns[sorted_idx]) ax.set_title("Permutation Importances") fig.tight_layout() return plt.show() def plot_tree(self, tree_number=0): """ Parameters ---------- number : Int. Tree to plot. The default is 0. Returns ------- Tree Image """ model_rf = self.model_fit fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 30), dpi=800) tree.plot_tree(model_rf.estimators_[tree_number], feature_names=self.features_list, class_names='price', filled=True) fig.savefig('data/rf_individualtree.png') return fig def feature_imp(self): """ Feature Importance Model Method Returns ------- Dataframe with features Importance """ df = (pd.DataFrame({ "ft": self.features_list, 'imp': self.model_fit.feature_importances_ })) df.sort_values(by='imp', ascending=False, inplace=True) return df @property def depth_of_trees(self): """ Returns ------- Dataframe with Trees depth """ #Get depth of trees max_depth_list = [] rf = RandomForestRegressor(n_estimators=2500, max_features=0.35) feat_tsf = self.feat_tsf_dataset labels = self.labels_dataset rf.fit(feat_tsf, labels) for i in rf.estimators_: max_depth_list.append(i.get_depth()) print("Max depht: %i trees" % max(max_depth_list)) return pd.DataFrame(max_depth_list, columns=['trees']) def train_test_samples(self, features, labels, test_size=0.20, random_state=None): feat_tsf = self.feat_tsf_dataset labels = self.labels_dataset X_train, X_test, y_train, y_test = train_test_split( feat_tsf, labels, test_size=test_size, random_state=random_state) return X_train, X_test, y_train, y_test def avg_price_district(self, district): df = self.dataset_preprocessed df = df.groupby('district').mean()['priceByArea'] return int(df.loc[df.index == district].values[0]) @property def propertyTypeList(self): propertyTypelist = ['Flat', 'Attic', 'Villa', 'Country House'] return propertyTypelist def propertyTypeConverter(self, propertyType): """ Parameters ---------- propertyType : Str Selected Property Type. 
Returns ------- Property Type str """ #Options list propertyTypelist = self.propertyTypeList #Lower elements list propertyTypelist = [i.lower() for i in propertyTypelist] #Assertion assert propertyType.lower() in propertyTypelist #Default Value propertyTypeOutput = 'flat' if propertyType.lower() == 'Flat': propertyTypeOutput = 'flat' elif propertyType.lower() == 'Attic': propertyTypeOutput = 'penthouse' elif propertyType.lower() == 'Villa': propertyTypeOutput = 'villa' elif propertyType.lower() == 'CountryHouse': propertyTypeOutput = 'countryHouse' return propertyTypeOutput @property def statusList(self): status_it = ['To be restructured', 'Good', 'New Construction '] return status_it def roomsCategory(self, rooms): """ Parameters ---------- rooms : Int Rooms Returns ------- Rooms Category """ roomsCat = 1 if rooms >= 4: roomsCat = 4 else: roomsCat = rooms return roomsCat def bathroomsCategory(self, bathrooms): """ Parameters ---------- bathrooms : Int bathRooms Returns ------- Barthooms Category """ bathroomsCat = 1 if bathrooms >= 2: bathroomsCat = 2 else: bathroomsCat = bathrooms return bathroomsCat
def evalSentence(self, model, sentence):
    return self.classificator.classifyWithModel(
        model, sentence, Preprocessing(con=self.con))
def __init__(self):
    self.__utils = Utils()
    self.__preprocessing = Preprocessing()
    self.model_fit = self.fit()
    self.__preprocessing.execute()
        description=('Estimate the Gibbs energy of a reaction. For example, '
                     'the following calculates dGr0 for ATP hydrolysis '
                     'at pH 6: calc_dGr0.py --ph 6 "C00002 + C00001 = '
                     'C00008 + C00009"'))
    parser.add_argument('--ph', type=float, help='pH level', default=7.0)
    parser.add_argument('--i', type=float, help='ionic strength in M',
                        default=0.1)
    parser.add_argument('reaction', type=str, help='reaction in KEGG notation')
    return parser

###############################################################################

parser = MakeParser()
args = parser.parse_args()

logging.getLogger().setLevel(logging.WARNING)

print 'pH: %.1f' % args.ph
print 'I: %.1f M' % args.i
print 'Reaction: ' + args.reaction

# parse the reaction
reaction = Reaction.parse_formula(args.reaction)

p = Preprocessing()

# use the preprocessing class to calculate the estimated dG0 and uncertainty
dG0_prime, U = p.dG0_prime(reaction, pH=args.ph, I=args.i)

print u'dGr0 = %.2f \u00B1 %.2f kJ/mol' % (dG0_prime[0, 0], U[0, 0])
    output_2 = Dropout(0.1)(output_2)
    output_2 = Dense(32, activation='relu')(output_2)
    output_2 = Dropout(0.1)(output_2)
    output = Dense(outputdim, activation='softmax')(output_2)
    model = Model(inputs=[
        input_scene, input_before_sents, input_sents, input_before_char
    ], outputs=output)
    return model


if __name__ == '__main__':
    embedding_dim = 62
    preprocessing = Preprocessing()
    (characters, paraid2scene, paraid2chars, paraid2sents, episodeid2paraid,
     para_number, word_dict) = preprocessing.load_dataset()
    (_, id2char, _, id2word, char_number, word_number,
     embedding_matrix) = preprocessing.encoding_reduction(characters, word_dict)
    X, Y, X_test_2, Y_test_2 = preprocessing.generate_X_Y_split_beforechar(
        characters, paraid2scene, paraid2chars, paraid2sents, episodeid2paraid,
        para_number, word_dict)
    print('data loaded')
    X_test_1 = [
        np.array(X[0][10000:20000]),
        np.array(X[1][10000:20000]),
        np.array(X[2][10000:20000]),
        np.array(X[3][10000:20000])
    ]
    tuples = sorted(tuples, key=lambda x: x[0])
    plt.figure(figsize=(10, 10))
    key_color = map(lambda x: 1 if x.startswith('PRI') else 0, zip(*tuples)[1])
    colors = np.asarray(['c', 'g'])
    plt.barh(bottom=pos, width=zip(*tuples)[0], color=colors[key_color],
             edgecolor=None, alpha=0.7)
    plt.yticks(np.arange(1, max(pos) + 0.5), zip(*tuples)[1], fontsize='small')
    plt.xlabel('Correlation', fontsize='small')
    plt.title('Ranking by point biserial correlation - Features v. Class',
              fontsize='small')
    plt.savefig('../Graphs/pbc.png')
    plt.tight_layout()


if __name__ == "__main__":
    hd = HiggsData(path=settings.get('paths', 'path_data'), imputation=True)
    df = Preprocessing.remove_missing_values(hd.processed_input, np.NaN)
    b_processed = Preprocessing.get_features(df[df.Label == -1])
    s_processed = Preprocessing.get_features(df[df.Label == 1])
    labels = hd.raw_input.ix[df.index]['Label']
    df_features = Preprocessing.get_features(df)
    cols = df_features.columns

    fishers = plot_fishers_ratio(b_processed, s_processed)
    pbc = plot_feature_class_corr_matrix(df_features, labels, cols)
def train(ink_dir, lg_dir): """ This function is used for training model :param ink_dir: :param lg_dir: :return: """ lg_files = os.listdir(lg_dir) pre = Preprocessing() feature_matrix = [] targets = [] c = 0 total = len(lg_files) for file in lg_files: print(file, total - c, c) symbols = {} with open(lg_dir + "/" + file) as f: for line in f: if line.startswith("O"): filt_line = line.strip().split(",") symbols[filt_line[1].strip()] = [ filt_line[2], filt_line[4:] ] inkml_file = file.replace(".lg", ".inkml") with open(ink_dir + "/" + inkml_file) as f: soup = bs.BeautifulSoup(f, 'html.parser') for key in symbols: label = symbols[key][0] strokes = symbols[key][1] id_list = [] X = [] Y = [] for id in strokes: st_id = id.strip() trace = soup.findAll("trace", {'id': st_id}) coords = trace[0].text.strip().split(",") x = [] y = [] for coord in coords: trace_parts = coord.strip().split(' ') x.append(float(trace_parts[0])) y.append(float(trace_parts[1])) X.append(x) Y.append(y) id_list.append(st_id) X, Y = pre.dopreprocess(x=X, y=Y, parser=True) symbols[key] = Symbol(label=label, x=X, y=Y, stroke_id=id_list) # relations section with open(lg_dir + "/" + file) as f: for line in f: if line.startswith("EO"): filt_line = line.strip().split(",") sym1 = symbols[filt_line[1].strip()] sym2 = symbols[filt_line[2].strip()] relation = filt_line[3].strip() writing_slope = sym1.writing_slope(sym2) writing_curve = sym1.writing_curvature(sym2) bb_dist = sym1.distance_between_box(sym2) distance, horizontal_ofsset, vertical_distance = sym1.distance_between_average_centres( sym2) max_point_pair = sym1.maximal_point_distance(sym2) feature_matrix.append([ writing_slope, writing_curve, bb_dist, distance, horizontal_ofsset, vertical_distance, max_point_pair ]) targets.append(relation) c += 1 print("Shape of Training matrix") print(len(feature_matrix), "x", len(feature_matrix[0])) print("Unique labels : ", np.unique(targets)) rf = RandomForestClassifier(n_estimators=100, n_jobs=-1) rf.fit(X=feature_matrix, y=targets) joblib.dump(rf, "relation_classifier_bonus.pkl", protocol=pickle.HIGHEST_PROTOCOL) rf = joblib.load("relation_classifier_bonus.pkl") score = accuracy_score(y_true=targets, y_pred=rf.predict(feature_matrix), normalize=True) print("accuracy of model is :", (score * 100))
def apply_preprocessing(self):
    data = self.get_data()
    # reuse a single Preprocessing instance for both steps
    preprocessing = Preprocessing()
    data = preprocessing.categorical_column_to_numerical(data)
    data = preprocessing.normalize_numerical_columns(data)
    return data
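The two Preprocessing helpers called above are not shown; the pandas sketch below is only an assumption about their intent (integer-encoding categorical columns and min-max scaling numeric ones).

# Minimal sketch, assuming the helpers work roughly like this; not the
# project's actual Preprocessing implementation.
import pandas as pd

def categorical_column_to_numerical(data):
    # integer-encode every object/categorical column
    for col in data.select_dtypes(include=['object', 'category']).columns:
        data[col] = pd.factorize(data[col])[0]
    return data

def normalize_numerical_columns(data):
    # min-max scale every numeric column into [0, 1]
    for col in data.select_dtypes(include='number').columns:
        span = data[col].max() - data[col].min()
        if span != 0:
            data[col] = (data[col] - data[col].min()) / span
    return data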
fmeasure_pos = []
acc_per_fold = []

# load the Tala stopword list
with open("stopword_tala.txt", "r") as f:
    stopwords = f.read().split()

fold = list(range(1, 11))

for i in range(len(data_train)):
    print("Fold " + str(i + 1))
    # print(len(data_train[i]["tweet"]))
    # print(len(data_test[i]["tweet"]))
    y_test = []
    y_pred = []

    # STOPWORD PREPARATION STAGE
    prepro = Preprocessing()
    new_cleaned_data, new_terms = prepro.preprocessing(data_train[i]["tweet"],
                                                       stopwords=stopwords)

    # TRAINING STAGE
    weight = Weighting(new_cleaned_data, new_terms)
    tfidf = weight.get_tf_idf_weighting()
    idf = weight.get_idf()
    nb = NBMultinomial()
    nb.fit(new_cleaned_data, new_terms, data_train[i]["target"], stopwords,
           idf, tfidf)

    for j in range(len(data_test[i]["tweet"])):
        print("Test " + str(j))
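The acc_per_fold and fmeasure_pos lists suggest per-fold metrics are collected once y_test and y_pred have been filled by the inner test loop, which is not shown here. A hedged sketch with sklearn, assuming the positive class is labelled "positif":

# Sketch only: the original evaluation code is not shown; sklearn and the
# "positif" label name are assumptions.
from sklearn.metrics import accuracy_score, f1_score

acc_per_fold.append(accuracy_score(y_test, y_pred))
fmeasure_pos.append(f1_score(y_test, y_pred, pos_label="positif", average="binary"))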
def __init__(self): self.pp = Preprocessing()
from preprocessing import Preprocessing
from model import UnetModel

data_path = '..'

"""
To train different classes
for buildings: class_dict = {1: 0}
for roads:     class_dict = {3: 0}
for tracks:    class_dict = {4: 0}
"""
class_dict = {1: 0}

Patch_size = 224
N_split = 15
n_classes = len(class_dict)
inp_shape = (Patch_size, Patch_size, 20)

preprocessor = Preprocessing(data_path, class_dict)
unet_model = UnetModel(inp_shape, n_classes)
unet_model.getModel(0.2)
print("Model generated")
unet_model.compileModel()
print("Model compiled")

epochs = 100
batch_size = 16
train_image_data_gen = preprocessor.imagePatchGenerator(batch_size)
val_image_data_gen = preprocessor.imagePatchGenerator(batch_size, val_data=True)
print("batch generators generated")
print("Training...")
trained_model = unet_model.train_generator(batch_size,
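The internals of imagePatchGenerator are not shown; the sketch below only illustrates the batch shapes a generator for this configuration would be expected to yield (224x224 patches, 20 channels, one mask channel per class).

# Illustration only: not the real imagePatchGenerator implementation.
import numpy as np

def dummy_patch_generator(batch_size, patch_size=224, channels=20, n_classes=1):
    while True:
        images = np.random.rand(batch_size, patch_size, patch_size, channels)
        masks = np.random.randint(0, 2, (batch_size, patch_size, patch_size, n_classes))
        yield images, masks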
from keras import optimizers, losses, metrics from preprocessing import Preprocessing from simulation import Simulation from spp import SpatialPyramidPooling, R2 import numpy as np import matplotlib.pyplot as plt # Parameters setting num_of_cells = 2 num_of_CUEs = 2 num_of_D2Ds = (2, 3) batch_size = 64 epochs = 10 # Get the image data format which Keras follows image_data_format = Preprocessing.GetImageDataFormat() # Get the input data and target data input_data_list = [ Preprocessing.GetInputData(num_of_cells, num_of_CUEs, i, (2000, 8000, 10000), image_data_format) for i in num_of_D2Ds ] target_data_list = [ Preprocessing.GetTargetData(num_of_cells, num_of_CUEs, i, (2000, 8000, 10000)) for i in num_of_D2Ds ] # Reshape the input data for index, input_data in enumerate(input_data_list): rows, cols, channels = Preprocessing.GetInputShape(input_data)
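The loop above is about to reshape each input array to match the backend's image data format, but its body is not shown. A minimal sketch, assuming each sample arrives flattened and is restored to a channels_last layout before an optional transpose:

# Sketch only: the original loop body and Preprocessing internals are not shown.
import numpy as np

def reshape_for_backend(flat_input, rows, cols, channels, image_data_format):
    data = np.reshape(flat_input, (-1, rows, cols, channels))
    if image_data_format == 'channels_first':
        # move channels ahead of the spatial dimensions
        data = np.transpose(data, (0, 3, 1, 2))
    return data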
def output_category_num_test(self): instance=Preprocessing(self.config) instance.output_category_num_scale()
def __init__(self, path, limit): self.path = path self.limit = limit self.preprocessing = Preprocessing()
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from gensim.matutils import argsort
from gensim import corpora, models, similarities
from gensim.topic_coherence import segmentation, probability_estimation, direct_confirmation_measure, aggregation, indirect_confirmation_measure
from pprint import pprint
from collections import namedtuple
from sklearn.decomposition import LatentDirichletAllocation
from preprocessing import Preprocessing
from sklearn.model_selection import GridSearchCV
from custom_vectorizer import OwnCountVectorizer

# nltk.download('words')

preprocessing = Preprocessing()


def prepare_data(filename):
    '''
    Load and prepare the data for topic modeling.
    '''
    print 'Loading dataset...'
    data = pd.read_csv(filename, encoding='utf-8')
    data = preprocessing.remove_null(data)
    print 'Dataset loaded'

    print 'Preparing text inputs...'
    # NOTE: the source columns are assumed to be the scraped title/content fields
    titles = data['scraped_title']
    texts = data['scraped_content']
    texts = preprocessing.preprocess_text(texts)
    titles = preprocessing.preprocess_text(titles)
    titles = titles[0:len(texts)]
    text_input = concat_text_input(titles, texts)
    text_data = data['scraped_title'] + ' ' + data['scraped_content']
    return text_input, text_data
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from preprocessing import Preprocessing
Preprocessing = Preprocessing()
from models import models
models = models()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, auc, roc_curve, normalized_mutual_info_score

data = pd.read_csv('./data.csv', index_col=0)
# drop identifier columns; DataFrame.drop returns a new frame, so reassign
data = data.drop(['patkey', 'index_date', 'MATCHID'], axis=1)
data['age_at_index'] = data['age_at_index'] - 5
data = Preprocessing.FeatureEncoding(data)
data = Preprocessing.MissingData(data)
data.to_csv('data_complete.csv')

data = pd.read_csv('./data_complete.csv', index_col=0)

#==========================================================================================
# After using KNN to handle missing data, count and plot the histogram of features
'''
print(data.loc[:, 'Smoking_status'].value_counts())
print(data.loc[:, 'BMI_group'].value_counts())
print(data.loc[:, 'Alcohol_status'].value_counts())

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2. - 0.2, 1.03 * height, '%s' % int(height))
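The comment above refers to KNN-based imputation of missing data, but Preprocessing.MissingData itself is not shown. A minimal sketch with sklearn's KNNImputer, as an assumption about what that step does:

# Sketch only: the real Preprocessing.MissingData implementation is not shown;
# n_neighbors=5 is an arbitrary choice here.
import pandas as pd
from sklearn.impute import KNNImputer

def knn_impute(df, n_neighbors=5):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)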
def save_trails(encoder, alg, scr, data, path):
    """
    pre_parameters = pre_encoder(encoder)[0]
    preprocessor = pre_encoder(encoder)[1]
    alg: algorithm, a dict(); each entry maps a name to [estimator, params],
         e.g. {'knn': [KNN(), knn_params]}
    scr: scorings, a dict(); each entry maps a name to a scorer, e.g. 'acc': ACC
    data: a dict() of datasets, {'dataname': [trainset, testset]}
    path: ['~/all_models', '~/best_models'], to decide first chart or second chart.
    """
    # parameters
    params = Preprocessing.pre_encoder(encoder)[0]
    preprocessor_ = Preprocessing.pre_encoder(encoder)[1]
    alg_name = list(alg)[0]
    data_name = list(data)[0]
    clsf = alg[alg_name][0]
    param = alg[alg_name][1]
    params.append(param)
    dataset = data[data_name][0]
    test_set = data[data_name][1]
    pipeline = Pipeline([('preprocessing', preprocessor_), ('classifier', clsf)])
    X = dataset.drop(columns=['target'])
    y = dataset.target
    print(y.unique())
    if len(y.unique()) > 2:
        # this will transform y into a 1/0 ndarray
        lb = LabelBinarizer().fit(y)
        y = lb.transform(y)
    record_scores = {}
    for i in range(len(list(scr))):
        score_name = list(scr)[i]
        score = scr[score_name]
        print("Dataset is : ", data_name.upper())
        record_scores[score_name] = {}
        score_details = {}
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
        clf = GridSearchCV(pipeline, params, scoring=score, cv=5, n_jobs=-1,
                           refit=True, return_train_score=True, verbose=True)
        clf.fit(X_train, y_train)
        # test dataset
        best_model = clf.best_estimator_
        X_test = test_set.drop(columns=['target'])
        y_test = test_set.target
        y_pred = best_model.predict(X_test)
        testset_score = (y_test, y_pred)
        score_details['testset_score'] = testset_score
        score_details['best_score'] = clf.best_score_
        score_details['results'] = clf.cv_results_
        # save models to path
        save_models = os.path.join(path[0], alg_name, data_name, score_name)
        save_best_model = os.path.join(path[1], alg_name, data_name, score_name)
        # make sure both output directories exist
        for directory in (save_best_model, save_models):
            os.makedirs(directory, exist_ok=True)
        joblib.dump(clf, os.path.join(save_models, 'all_models.pkl'))
        joblib.dump(best_model, os.path.join(save_best_model, 'best_model.pkl'))
        print('All models scored in ' + score_name + ' saved in ', save_models)
        print('Best model scored in ' + score_name + ' saved in ', save_best_model)
        record_scores[score_name].update(score_details)
    print(record_scores)
    return record_scores
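A hypothetical call of save_trails, following the parameter shapes described in its docstring; the estimator, parameter grid, scorer, data frames and paths below are placeholders, not values from the original project.

# Hypothetical usage; every concrete value here is a placeholder.
from sklearn.neighbors import KNeighborsClassifier

knn_params = {'classifier__n_neighbors': [3, 5, 7]}   # keyed by the pipeline step name
results = save_trails(
    encoder='onehot',                                  # whatever pre_encoder expects
    alg={'knn': [KNeighborsClassifier(), knn_params]},
    scr={'acc': 'accuracy'},                           # any scoring GridSearchCV accepts
    data={'mydata': [train_df, test_df]},              # DataFrames with a 'target' column
    path=['all_models', 'best_models'],
)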
class Segmentation: def __init__(self): self.pre_processing = Preprocessing() self.LOWER_GRAY_2 = np.array([0, 0, 100]) self.UPPER_GRAY_2 = np.array([255, 80, 175]) def segment(self, image): mask, image_no_background = self.pre_processing.cut_out_backgound(image) image_hue = self.pre_processing.get_mask_brightness(image_no_background) #cv2.imshow("img_hue", image_no_background) mat_points = self.map_out(mask, image_no_background) image = self.pre_processing.equalize_clahe(image) return mat_points, image#image_no_background def map_out(self, img_bin, image): mat_points = [] contours, inheriters = cv2.findContours(img_bin, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) for c in contours: moments = cv2.moments(c) if moments['m00'] > 100: x = [] y = [] for i in c: for j in i: x.append(j[0]) y.append(j[1]) max_x, min_x, max_y, min_y = np.argmax(x), np.argmin(x), np.argmax(y), np.argmin(y) mat_points.append((x[min_x], y[min_y], x[max_x], y[max_y])) return mat_points def get_points_min_max(self, array): x, y = cv2.split(array) min_x = min(x)[0] x = list(x) pos_min_x = x.index(min_x) min_y = y[pos_min_x][0] max_x = max(x)[0] pos_max_x = x.index(max_x) max_y = y[pos_max_x][0] return (min_x, min_y),(max_x, max_y) def highlight_smoke_contours(self, image): mask = self.pre_processing.enhance_color(self.LOWER_GRAY_2, self.UPPER_GRAY_2, image) contours, cany = self.pre_processing.border_image(mask, image) i = 0 sub_mats = [] for c in contours: extension = cv2.contourArea(c) if extension > 600: contour = contours[i] (x_min, y_min), (x_max, y_max) = self.get_points_min_max(contour) sub_mats.append((x_min, y_min, x_max, y_max)) self.pre_processing.draw_image(image, 1, [(x_min, y_min), (x_max, y_max)], (255,0,0)) self.pre_processing.draw_image(image, 2, contour, (0,255,0)) self.pre_processing.draw_image(cany, 2, contour,(0,255,0)) i = i + 1 return image, cany, sub_mats
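A short, hypothetical usage of the Segmentation class above; the image path is a placeholder.

# Hypothetical usage; the input image path is a placeholder.
import cv2

segmenter = Segmentation()
frame = cv2.imread("frame.png")
mat_points, equalized = segmenter.segment(frame)                    # region boxes + CLAHE image
annotated, edges, smoke_boxes = segmenter.highlight_smoke_contours(frame)
print(len(mat_points), "segmented regions,", len(smoke_boxes), "smoke candidates")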
def perfectly_segmented_parser(ink_dir, bonus=False): """ This is a parser for perfectly segmented symbols :param ink_dir: inkml directory :param bonus: boolean for bonus :return: """ start = time.time() lg_dir = dir.strip().split("/")[0] + "_output_lg" if not os.path.exists(lg_dir): os.mkdir(lg_dir) ink_files = os.listdir(ink_dir) if bonus: print("Loaded Bonus classifier") clf = joblib.load("relation_classifier_bonus.pkl") else: print("Loaded relationship classifier") clf = joblib.load('relation_classifier4.pkl') pre = Preprocessing() total = len(ink_files) c = 0 gt_c = 0 for file in ink_files: print("Processing file : ", file, " Files remaining : ", total - c, " Files completed : ", c) f = open(os.path.join(ink_dir, file)) soup = bs.BeautifulSoup(f, 'html.parser') trace_groups = soup.find_all('tracegroup') symbol_list = [] #loop to isolate symbols for tracegroup in trace_groups[1:]: traceview = tracegroup.find_all('traceview') trace_id = [] #loop to get strokes in a single symbol for t in traceview: trace_id.append(t['tracedataref']) gt = tracegroup.annotation.text gt_c += 1 X = [] Y = [] #extract stroke coordinates for id in trace_id: traces = soup.findAll("trace", {'id': id}) for trace in traces: coords = trace.text.strip().split(",") x = [] y = [] for coord in coords: trace_parts = coord.strip().split(' ') x.append(float(trace_parts[0])) y.append(float(trace_parts[1])) X.append(x) Y.append(y) X, Y = pre.dopreprocess(x=X, y=Y, parser=True) if gt == ",": gt = "COMMA" sym_obj = Symbol(x=X, y=Y, label=gt, stroke_id=trace_id) symbol_list.append(sym_obj) symbol_count = {} #Run through list of symbols to get their count for sym in symbol_list: if sym.symbol not in symbol_count: symbol_count[sym.symbol] = 1 sym.sym_ct = symbol_count[sym.symbol] else: symbol_count[sym.symbol] += 1 sym.sym_ct = symbol_count[sym.symbol] #perform line of sight graph, labels = line_of_sight(symbol_list, clf) #run edmonds on los graph relations = edmonds(graph) #write result to lg write_to_lg(file=file, symbol_list=symbol_list, relations=relations, labels=labels, lg_dir=lg_dir) c += 1 print("System executed in ", (time.time() - start) / 60, " minutes.")
import metrics_reduced from preprocessing import Preprocessing import codecs from collections import Counter from constants import * s = metrics_reduced.SpanishTools() p = Preprocessing() with codecs.open('corpus/Version_2_classes/corpus_2_classes.txt', 'r', 'utf-8') as file: features = {} line = file.readline() while line: line_bigram_list = s.n_grams(p.preprocessing(line.split('/|/')[1])) for e in line_bigram_list: if e in features: features[e] += 1 else: features[e] = 1 line = file.readline() with codecs.open('bigram_features_filtered.txt', 'w', 'utf-8') as file: counter = Counter(features) common = counter.most_common(10000) temp_list=[] for k in common: for e in k[0].split(): if e in positive_words or e in negative_words: temp_list.append(k[0])
def get_city_rank_test(self): instance=Preprocessing(self.config) instance.output_city_rank()