def main(components):
    # Connection to Neo4j
    graph = Graph("http://localhost:7474/db/data/cypher", password="******")
    # Obtain the features for the corresponding InterMine model
    feature_array = create_features()
    # Obtain the list of genes
    genes, length_genes = get_genes(graph)
    # Treat each gene as a set
    sets = create_gene_documents(feature_array)
    # Shingles mapped to IDs
    shingles = generate_shingle_id(sets)
    # Get signatures based on the shingle IDs
    signatures = generate_signatures(shingles, components)
    # similarity_matrix = get_similarity_matrix(signatures, len(genes), components)
    # return similarity_matrix
    b = 10
    r = 4
    # Obtain the matrix formed by Locality Sensitive Hashing
    lsh_matrix = LSH(b, r, signatures, components)
    # Candidate genes for close inspection
    candidate_gene = candidate_genes(lsh_matrix)
    # Use the candidate-gene information to obtain similarity scores
    final_similarity = get_similar_genes(candidate_gene, genes)
    return final_similarity
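# The LSH() and candidate_genes() helpers above are not shown. Below is a
# minimal sketch of the banding step that LSH() presumably performs, assuming
# `signatures` is a (b * r) x n_genes integer matrix. The function name,
# shapes, and return type are assumptions for illustration, not the original
# implementation.
import numpy as np
from collections import defaultdict


def lsh_candidate_pairs(signatures, b, r):
    """Split each signature column into b bands of r rows; genes whose
    band slices collide in any band become candidate pairs."""
    n_genes = signatures.shape[1]
    candidates = set()
    for band in range(b):
        buckets = defaultdict(list)
        # Rows belonging to this band, for every gene.
        rows = signatures[band * r:(band + 1) * r, :]
        for gene in range(n_genes):
            buckets[tuple(rows[:, gene])].append(gene)
        # Every pair sharing a bucket in this band is a candidate.
        for bucket in buckets.values():
            for i in range(len(bucket)):
                for j in range(i + 1, len(bucket)):
                    candidates.add((bucket[i], bucket[j]))
    return candidates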
def predict_extended():
    f = request.files['data_file']
    if not f:
        return "No file"
    stream = io.StringIO(f.stream.read().decode("UTF8"), newline=None)
    stream.seek(0)
    result = transform(stream.read())
    df = pd.read_csv(StringIO(result))
    # Preprocessing & feature building
    X = create_features(df, chunk_size, 5, 1, 0.5)
    X = pd.DataFrame(columns=features, data=X)
    array_preds = extended_model.predict(X)
    prediction = stats.mode(array_preds)[0][0]
    display_text = "The predicted resolutions for each interval are: {}. \n Overall, " \
                   "the most commonly predicted resolution is: {}.".format(array_preds, prediction)
    return render_template('index.html', prediction_text_extended=display_text)
def tokenize_words(data: np.ndarray) -> np.ndarray:
    """Tokenize the words and add additional features.

    Arguments:
        data {np.ndarray} -- the data

    Returns:
        np.ndarray -- the tokenized data, with additional features
    """
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                          lower=True)
    lines = data[:, 6]
    tokenizer.fit_on_texts(lines)
    x = tokenizer.texts_to_sequences(lines)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
    additional_2 = np.array(
        [features.create_features(data[i])[0] for i in range(len(data))])
    x = np.hstack((x, additional_2))
    return x
def run_model():
    # Get the data
    df = acquire.get_telco_data()
    # Prepare the data
    df = prepare.drop_columns(df)
    df = prepare.fix_dtypes(df)
    # Add features
    df = features.create_features(df)
    # Encode the DataFrame
    df = encode.encode_df(df)
    # Select the features to be used in the model
    cols = ['contract_type', 'tenure', 'monthly_charges', 'payment_type', 'has_internet']
    X = df[cols]
    y = df.churn
    # Create and fit the model
    forest = RandomForestClassifier(n_estimators=100, max_depth=9,
                                    random_state=123).fit(X, y)
    # Create a DataFrame to hold the predictions
    results = pd.DataFrame({
        'Customer_ID': df.customer_id,
        'Model_Predictions': forest.predict(X),
        'Model_Probabilities': forest.predict_proba(X)[:, 1],
    })
    # Generate csv
    results.to_csv('model_results.csv')
    return results
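# A hypothetical driver for run_model(), showing how the returned DataFrame
# might be inspected. The sort/selection below is illustrative only; column
# names follow the snippet above.
if __name__ == '__main__':
    results = run_model()
    # Ten customers with the highest predicted churn probability.
    at_risk = results.sort_values('Model_Probabilities', ascending=False).head(10)
    print(at_risk[['Customer_ID', 'Model_Probabilities']])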
def main_operation():
    # Connection to Neo4j
    graph = Graph("http://localhost:7474/db/data/cypher", password="******")
    # Obtain the features for the corresponding InterMine model
    feature_array = create_features()
    # Obtain the list of genes
    genes, length_genes = get_genes(graph)
    # Compute a document (a list of tokens) for each gene
    gene_documents = create_gene_documents(feature_array)
    # Convert into feature vectors
    tfidf_vectors = compute_tfidf(gene_documents)
    # Obtain the cluster labels
    cluster_labels = compute_clusters(tfidf_vectors)
    # Obtain the clusters in the form of gene IDs
    gene_clusters = get_gene_clusters(cluster_labels, genes)
    print(gene_clusters)
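# compute_tfidf() and compute_clusters() are not shown; a minimal sketch of
# what they might look like with scikit-learn, assuming each gene document is
# a list of string tokens. The vectorizer/estimator choices and the
# n_clusters default are assumptions for illustration only.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def compute_tfidf(gene_documents):
    # Join each gene's tokens into one string so TfidfVectorizer can consume it.
    corpus = [" ".join(doc) for doc in gene_documents]
    return TfidfVectorizer().fit_transform(corpus)


def compute_clusters(tfidf_vectors, n_clusters=8):
    # KMeans accepts the sparse TF-IDF matrix directly.
    return KMeans(n_clusters=n_clusters, random_state=0).fit_predict(tfidf_vectors)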
def main_transitions(self):
    self.labels = self.nodes_labels()  # labels(node, (label, amount))
    self.train_person, self.test_person, self.train, self.test = self.train_test_split()
    # self.train_person = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/train_person.pkl", "rb"))
    # self.test_person = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/test_person.pkl", "rb"))
    # self.train = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/train_per_year.pkl", "rb"))
    # self.test = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/test_per_year.pkl", "rb"))
    # self.labels = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/labels.pkl", "rb"))

    # Create the network, connecting the neighbor edges and the timed edges
    mg_dict = self.sort_by_years()
    mg = self.create_multigraph(mg_dict)
    self.community_gnx, total_timed_edges = self.create_gnx(mg)

    # Create input for the GCN: a graph per timestamp, labels, and feature matrices
    preparations.main_prep(self.dataset_name, self.edges_path, self.nodes_path,
                           self.number_of_unique_labels, self.dataset_time_range[0])
    create_features(self.dataset_name, self.time_inds)  # feature matrices for the GCN

    params_ = {
        "data_name": self.dataset_name,  # parameters of the GCN model
        "net": Net,
        "epochs": Epochs,
        "activation": "relu",
        "dropout_rate": Dropout_Rate,
        "hidden_sizes": Hidden_Sizes,
        "learning_rate": Learning_Rate,
        "weight_decay": Weight_Decay,
        "time_ins": Time_Inds,
        "num_of_classes": self.number_of_unique_labels,
    }
    self.similarity_edges = run_trial(params_)  # runs the GCN model

    t = time.time()
    # Add the similarity edges to the graph, weighted by the similarity factor
    for u, v in self.similarity_edges:
        self.community_gnx.add_edge(u, v, weight=self.similarity_factor)
    print("adding similarity edges time: ", time.time() - t)

    self.cd = self.com_det()
    self.com_nodes = self.cd_com_nodes()
    print("number of communities: ", len(self.com_nodes))
    # self.cd = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/cd_" + str(self.name) + ".pkl", "rb"))
    # self.com_nodes = pickle.load(open("./dataset/" + self.dataset_name + "/pkl/com_nodes_" + str(self.name) + ".pkl", "rb"))
    # print("number of communities: ", len(self.com_nodes))

    # Paint communities
    t = time.time()
    train_com_nodes = self.com_nodes_t(self.train)
    train_com_labels = self.com_label(train_com_nodes)
    self.top_label = self.paint_com(train_com_labels)
    print("number of communities painted: ", len(self.top_label))
    print("paint communities time: {:.4f}".format(time.time() - t))

    self.test_com_nodes = self.com_nodes_t(self.test)
    test_com_label = self.com_label(self.test_com_nodes)
    # top_label_test = self.paint_com(test_com_label)
    self.com_size_test = {com: len(test_com_label[com]) for com in test_com_label}
    com_size_train = {com: len(train_com_labels[com]) for com in train_com_labels}

    # Accuracy
    self.node_com_top_label = self.node_comlabel()
    # Accuracy on train
    t = time.time()
    self.check_communal_accuracy_t(self.train, com_size_train, "train")
    print("accuracy train time: {:.5f}".format(time.time() - t))
    # Accuracy on test
    t = time.time()
    self.check_communal_accuracy_t(self.test, self.com_size_test, "test")
    print("accuracy test time: {:.5f}".format(time.time() - t))
    # Total accuracy
    total_accuracy = self.total_painting_accuracy()
    print("total accuracy in painted communities: {:.5f}".format(total_accuracy))

    # Entropy
    # Entropy on train
    t = time.time()
    self.check_communal_entropy(self.train, com_size_train, "train", train_com_labels)
    print("entropy train time: {:.5f}".format(time.time() - t))
    # Entropy on test
    t = time.time()
    self.check_communal_entropy(self.test, self.com_size_test, "test", test_com_label)
    print("entropy test time: {:.5f}".format(time.time() - t))

    # Transitions
    self.transitions_results()
    self.plot_compaint()
    self.plot_changes_per_year()
import pickle

import numpy as np

from features import create_features, PROJECT
from parse import load_data
from dict_vectorizer import DictVectorizer

videos, users, reviews = load_data()
orig_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
feats = create_features(orig_X, None)

v = DictVectorizer(sparse=False)
feats = v.fit_transform(feats)
# feats is now in vectorized format.
# v.transform() is the transformation that needs to be used on test data.
with open(PROJECT + "db/dictvectorizer.pickle", "wb") as fh:
    pickle.dump(v, fh)
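# The comment above notes that v.transform() must be used on test data. A
# hypothetical test-time counterpart -- the function name and the shape of
# `reviews` are assumptions mirrored from the training snippet:
import pickle
import numpy as np
from features import create_features, PROJECT


def vectorize_test_reviews(reviews):
    # Reload the fitted DictVectorizer and apply transform(), never
    # fit_transform(), so test features share the training vocabulary.
    with open(PROJECT + "db/dictvectorizer.pickle", "rb") as fh:
        v = pickle.load(fh)
    test_X = np.array([(x['date'], x['text'], x['user']) for x in reviews])
    return v.transform(create_features(test_X, None))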
import fix_paths  # sets up import paths; must run before project imports

from models.commit import Commit
import common
import config
import features
import pickle
from sklearn import svm

session = common.Session()
with open(config.SERIALIZED_SVC_LOCATION, 'rb') as svc_file:
    clf = pickle.loads(svc_file.read())

print('Classifying all commits.')
count = 0
for commit in session.query(Commit).all():
    commit.classification = int(clf.predict([features.create_features(commit)])[0])
    session.add(commit)
    count += 1
    if count % 1000 == 0:
        print(count)
session.commit()
def get_real_data():
    df = util.load_data_to_dataframe('dataset/val_test_split.json')
    unseen_test = create_features(df)
    train_feats, train_labels, _ = get_feats_labels_ids(unseen_test)
    return train_feats, train_labels
""" Runs the project. """ # Import local methods. import loading import cleaning import features import training # Load raw data into datasets. loading.load() # Clean data. cleaning.clean() # Create features. features.create_features() # Train neural network. training.train_neural_network() # Train logistic regression. training.train_logistic_regression()