def export_to_csv(
        self, results: Collection[RepoDiffMetrics],
) -> None:
    header: Tuple[str, ...] = ('role id', 'v1', 'v2')
    change_cats = sorted(c.__name__ for c in get_diff_category_leafs())
    header = tuple(chain(header, change_cats))

    csv_lines: List[Tuple[Union[str, int, None], ...]] = []
    for repo_metrics in results:
        role_id = repo_metrics.id
        for metrics in repo_metrics.metric_map.values():
            v1 = metrics.v1
            v2 = metrics.v2
            summ = metrics.metric_summary
            csv_line: Tuple[Union[str, int, None], ...] = (role_id, v1, v2)
            if summ is None:
                csv_line = tuple(chain(
                        csv_line, [None] * len(change_cats)))
            else:
                csv_line = tuple(chain(
                        csv_line, (summ[cat] for cat in change_cats)))
            csv_lines.append(csv_line)

    write_csv(self.out / 'diff_metrics.csv', header, csv_lines)
def run(): print("Performing Base MLP") print("-------------------") # Train with dataset mlp_clfr = MLPClassifier(activation="logistic", solver="sgd") #100 neurons by default mlp_model1 = mlp_clfr.fit(train1_x, train1_y) mlp_model2 = mlp_clfr.fit(train2_x, train2_y) # Predict trained model with dataset y_predict1 = mlp_model1.predict(test1_x) y_predict2 = mlp_model2.predict(test2_x) # Evaluate score on dataset eval_dataset(1, mlp_model1) # Plot confusion matrix c_matrix1 = plot_confusion_matrix(mlp_model1, test1_y, y_predict1) # Print precision, recall, f1-score, accuracy, macro-avg f1, weighted-avg f1 print_model_details(test1_y, y_predict1) # Repeat steps for dataset 2 eval_dataset(2, mlp_model2) test2_y_predict = mlp_model2.predict(test2_x) c_matrix2 = plot_confusion_matrix(mlp_model2, test2_y, y_predict2) print_model_details(test2_y, y_predict2) # Output results into file util.write_csv("./output/Base-MLP-DS1.csv", test1_y, y_predict1, c_matrix1) util.write_csv("./output/Base-MLP-DS2.csv", test2_y, y_predict2, c_matrix2)
def main(args): # Set revcomp parameter. if args.r != 1: args.r = False elif args.r == 1 and args.alphabet != 'DNA': print("Error, the -r parameter can only be used in DNA.") elif args.r == 1 and args.alphabet == 'DNA': args.r = True # Set alphabet parameter. if args.alphabet == 'DNA': args.alphabet = index_list.DNA elif args.alphabet == 'RNA': args.alphabet = index_list.RNA elif args.alphabet == 'Protein': args.alphabet = index_list.PROTEIN res = make_kmer_vector(k=args.k, alphabet=args.alphabet, filename=args.inputfile, revcomp=args.r) # Write correspond res file. if args.f == 'svm': from util import write_libsvm write_libsvm(res, [args.l] * len(res), args.outputfile) elif args.f == 'tab': from util import write_tab write_tab(res, args.outputfile) elif args.f == 'csv': from util import write_csv write_csv(res, args.outputfile)
def run():
    f = open('/home/kirayue/final/metadata', 'r')
    metadata = json.load(f)
    label = open('dataset/t1_train_label.txt', 'r').readlines()

    X = []
    y = []
    for ind, v in enumerate(label):
        if str(ind) in metadata:
            feature = []
            cur_metadata = metadata[str(ind)]
            feature.append(cur_metadata['commentCount'])
            feature.append(cur_metadata['num_groups'])
            feature.append(cur_metadata['viewCount'])
            feature.append(cur_metadata['faveCount'])
            X.append(feature)
            y.append(float(label[ind].replace('\n', '')))

    train_num = int(len(X) * 0.8)
    reg = RandomForestRegressor(n_estimators=400, n_jobs=50)
    print("Training ...")
    reg.fit(X[:train_num], y[:train_num])
    y_t = reg.predict(X[train_num:])
    print("MAE: {}".format(mean_absolute_error(y[train_num:], y_t)))
    print("MSE: {}".format(mean_squared_error(y[train_num:], y_t)))
    util.write_csv(y_t, y[train_num:], 'result/meta_predict.csv')
def main(args):
    # TODO: args.method will be finished
    # TODO: args.inputfile, name
    if args.alphabet == "RNA":
        if args.method.upper() == 'TRIPLET':
            res = get_triplet_matrix(args.inputfile)
        elif args.method.upper() == 'PSESSC':
            if args.k is None:
                print("parameter k is required. The default value of k is 2.")
                args.k = 2
            if args.r is None:
                print("parameter r is required. The default value of r is 2.")
                args.r = 2
            if args.w is None:
                print("parameter w is required. The default value of w is 0.1.")
                args.w = 0.1
            res = get_psessc_matrix(args.inputfile, args.k, args.r, args.w)
        elif args.method.upper() == 'PSEDPC':
            if args.n is None:
                print("parameter n is required. The default value of n is 0.")
                args.n = 0
            if args.r is None:
                print("parameter r is required. The default value of r is 2.")
                args.r = 2
            if args.w is None:
                print("parameter w is required. The default value of w is 0.1.")
                args.w = 0.1
            res = get_psedpc_matrix(args.inputfile, args.n, args.r, args.w)
        else:
            print("Method error!")
    else:
        print("sequence type error!")

    # Write the corresponding res file.
    if args.f == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif args.f == 'svm':
        if args.multi == 0 and args.l is None:
            args.l = '+1'
        elif args.multi == 0 and (args.l != '+1' and args.l != '-1'):
            print("For binary classification, the label should be either '+1' or '-1'.")
            return False
        elif args.multi == 1 and args.l is None:
            args.l = '0'
        elif args.multi == 1 and args.l is not None:
            try:
                label = int(args.l)
            except ValueError:
                print('The labels should be integers.')
                return False
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
def run(): data = h5py.File("image_arr_N_224_224_3.processed.hdf5", "r") test_X = data['X'][300000:] test_y = data['y'][300000:] model = load_model('model/fine_tune_VGG19.train_conv.train_fc.model') y_t = model.predict(test_X) print("MAE: {}".format(mean_absolute_error(test_y, y_t))) print("MSE: {}".format(mean_squared_error(test_y, y_t))) util.write_csv(y_t, test_y, 'result/cnn.csv')
def build_author_key_csv(author_key_list, authors):
    csv = []
    for author in authors:
        row = [author['id'], author['first_name'], author['last_name'],
               author['email']]
        for key in author['keys']:
            csv += [row + [key, 'x', make_author_link(key)]]
    write_csv(author_key_list,
              ['id', 'first_name', 'last_name', 'email', 'key', 'valid',
               'key_link'],
              csv)
def main(args):
    with open(args.inputfile) as f:
        k = read_k(args.alphabet, args.method, 0)

        # Get index_list.
        if args.i is not None:
            from pse import read_index
            ind_list = read_index(args.i)
        else:
            ind_list = []

        default_e = []
        # Set Pse default index_list.
        if args.alphabet == 'DNA':
            args.alphabet = index_list.DNA
            if k == 2:
                default_e = const.DI_INDS_6_DNA
            elif k == 3:
                default_e = const.TRI_INDS_DNA
        elif args.alphabet == 'RNA':
            args.alphabet = index_list.RNA
            default_e = const.DI_INDS_RNA
        elif args.alphabet == 'Protein':
            args.alphabet = index_list.PROTEIN
            default_e = const.INDS_3_PROTEIN

        theta_type = 1
        if args.method in const.METHODS_AC:
            theta_type = 1
        elif args.method in const.METHODS_CC:
            theta_type = 2
        elif args.method in const.METHODS_ACC:
            theta_type = 3
        else:
            print("Method error!")

        # ACC.
        if args.e is None and len(ind_list) == 0 and args.a is False:
            # Default Pse.
            res = acc(f, k, args.lag, default_e, args.alphabet,
                      extra_index_file=args.e, all_prop=args.a,
                      theta_type=theta_type)
        else:
            res = acc(f, k, args.lag, ind_list, args.alphabet,
                      extra_index_file=args.e, all_prop=args.a,
                      theta_type=theta_type)

    # Write the corresponding res file.
    if args.f == 'tab':
        from util import write_tab
        write_tab(res, args.outputfile)
    elif args.f == 'svm':
        from util import write_libsvm
        write_libsvm(res, [args.l] * len(res), args.outputfile)
    elif args.f == 'csv':
        from util import write_csv
        write_csv(res, args.outputfile)
def download(project_name, url, storyPointKey):
    try:
        jira = JIRA(url, basic_auth=(credential.username, credential.password))
    except Exception:
        jira = JIRA(url)

    status = "Resolved, Done, Closed"
    # jql = 'project=' + project_name + \
    #       ' AND status in (' + status + ')' + \
    #       ' AND "' + storyPointKey + '" > 0'  # AND "Actual Story Points" > 0'
    jql = 'project=' + project_name + \
          ' AND status in (' + status + ')' + \
          ' AND "' + storyPointKey + '" > 0 AND "Actual Story Points" > 0'

    block_size = 100
    block_num = 0
    header_fields = None
    data_list = []
    while True:
        start_idx = block_num * block_size
        # Fetch the same block twice instead of deep-copying the issue objects.
        original_issues = jira.search_issues(jql, start_idx, block_size, expand="changelog")
        latest_issues = jira.search_issues(jql, start_idx, block_size, expand="changelog")
        issue_field_name_id, issue_field_id_name = get_field_name_id_list(jira)
        if len(original_issues) == 0:
            # Retrieve issues until there are no more to come.
            break
        block_num += 1
        print("BLOCK = " + str(block_num))

        for x in range(len(original_issues)):
            original_issue = original_issues[x]
            latest_issue = latest_issues[x]
            # field_names, history = original_issue_extractor.run(original_issue, latest_issue, issue_field_name_id, issue_field_id_name)
            field_names, history = issue_extractor.run(original_issue, latest_issue,
                                                       issue_field_name_id, issue_field_id_name)
            for item in history:
                data_list.append(item)
            header_fields = field_names  # lowercase only
            # print('%s: %s' % (issue.key, issue.fields.summary))

    util.write_csv(filename=project_name, field_names=header_fields, data_records=data_list)
def build_paper_csv(pub_list, authors, whitelist):
    schema = ['id', 'first_name', 'last_name', 'email', 'keys', 'valid',
              'pub_key', 'pub_title', 'pub_year', 'pub_authors']
    csv = []
    for k, author in authors.items():
        row = [k, author['first_name'], author['last_name'], author['email'],
               ";".join(author['keys']), 'x']
        for pub in author['pubs']:
            csv += [row + [pub['key'], pub['title'], pub['year'],
                           ';'.join(pub['authors'])]]
    write_csv(pub_list, schema, csv)
def test1(): fname = "./data/temperature.csv" city = "Denver" #city = "New York" #city = "Kansas City" #city = "Seattle" data = load(fname, city) util.write_csv(data, "./data/denver.csv") dat = [(city, data)] util.plot_figs(dat, "temprature") return
def write_mw_prefixed_roots(prefixed_roots, unprefixed_roots, prefix_groups,
                            sandhi_rules, out_path):
    """Parse the prefixes in a prefixed root and write the parsed roots."""
    with util.read_csv(prefix_groups) as reader:
        prefix_groups = {x['group']: x['prefixes'] for x in reader}
    with util.read_csv(unprefixed_roots) as reader:
        root_set = {(x['root'], x['hom']) for x in reader}

    candidate_homs = [None] + [str(i) for i in range(1, 10)]
    sandhi = make_sandhi_object(sandhi_rules)

    rows = []
    for row in util.read_csv_rows(prefixed_roots):
        for group in sandhi.split_off(row['prefixed_root'],
                                      row['unprefixed_root']):
            if group in prefix_groups:
                basis, hom = row['unprefixed_root'], row['hom']
                if (basis, hom) not in root_set:
                    for x in candidate_homs:
                        if (basis, x) in root_set:
                            hom = x
                            break
                    if (basis, hom) not in root_set:
                        continue
                rows.append((row['prefixed_root'], prefix_groups[group],
                             row['unprefixed_root'], hom))
                break

    labels = ['prefixed_root', 'prefixes', 'unprefixed_root', 'hom']
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(dict(zip(labels, row)))
def write_shs_verbal_indeclinables(adverbs_path, final_path, root_converter,
                                   out_path):
    """Write SHS verbal indeclinables."""
    labels = None
    clean_rows = []
    with util.read_csv(adverbs_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            clean_rows.append(row)

    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            row['root'] = root_pair[0]
            row['hom'] = root_pair[1]
            # TODO: handle 'ya' gerunds
            if not row['form'].endswith('um'):
                continue
            clean_rows.append(row)
        labels = reader.fieldnames
    labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
def write_shs_verbal_data(data_path, root_converter, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param root_converter: maps an SHS root to an (MW root, homonym) pair.
                           Roots missing from the map are skipped.
    :param out_path: path to the output CSV
    """
    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair
            row['root'] = root
            row['hom'] = hom
            clean_rows.append(row)
        labels = reader.fieldnames
    labels.insert(labels.index('root') + 1, 'hom')

    with util.write_csv(out_path, labels) as write_row:
        for row in clean_rows:
            write_row(row)
def run():
    # ========= DATASET 1 ========= #
    filepath = "./output/Base-DT-DS1.csv"
    X_train, Y_train = util.load_csv(util.train_1_filepath)
    X_test, Y_test = util.load_csv(util.test_with_label_1_filepath)
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    # Train
    clf = clf.fit(X_train, Y_train)
    # Test/Predict
    Y_pred = clf.predict(X_test)
    # Confusion Matrix
    confusion_matrix = metrics.confusion_matrix(Y_test, Y_pred)
    metrics.plot_confusion_matrix(clf, X_test, Y_test)
    # Evaluation
    classification_report = metrics.classification_report(Y_test, Y_pred)
    # Debug print
    print_debug(1, clf, Y_pred, confusion_matrix, classification_report)
    # Save
    util.write_csv(filepath, Y_test, Y_pred, confusion_matrix)

    # ========= DATASET 2 ========= #
    filepath = "./output/Base-DT-DS2.csv"
    X_train, Y_train = util.load_csv(util.train_2_filepath)
    X_test, Y_test = util.load_csv(util.test_with_label_2_filepath)
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    # Train
    clf = clf.fit(X_train, Y_train)
    # Test/Predict
    Y_pred = clf.predict(X_test)
    # Confusion Matrix
    confusion_matrix = metrics.confusion_matrix(Y_test, Y_pred)
    metrics.plot_confusion_matrix(clf, X_test, Y_test)
    # Evaluation
    classification_report = metrics.classification_report(Y_test, Y_pred)
    # Debug print
    print_debug(2, clf, Y_pred, confusion_matrix, classification_report)
    # Save
    util.write_csv(filepath, Y_test, Y_pred, confusion_matrix)


# DEBUG --------------------------------------------------------------------
# run()
def main():
    data = util.read_csv('C:/test/csv/test.csv')
    new_data = []
    row_idx = 0
    col_idx = 0
    for row in data:
        row_idx = row_idx + 1
        new_row = []
        col_idx = 0
        for col in row:
            col_idx = col_idx + 1
            s = str(row_idx) + ':' + str(col_idx) + '=' + col
            print(s)
            new_row.append(col + '_NEW')
        new_data.append(new_row)
    util.write_csv('C:/tmp/newcsv.csv', new_data)
def main(args):
    X = pd.read_csv(args.data)
    y = pd.read_csv(args.labels)
    X, y = SMOTE().fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    svclassifier = SVC(kernel='linear')
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    X_test = pd.read_csv(args.test_set)
    y_pred = svclassifier.predict(X_test)
    util.write_csv('predicted_svm.csv', np.transpose(np.array(y_pred, ndmin=2)))
def run():
    f1 = h5py.File('image_features_4096.hdf5', 'r')
    f2 = h5py.File('image_arr_N_224_224_3.hdf5', 'r')
    X = f1['X'][()]
    y = f2['y'][:len(X)]

    model_path = 'model/rf_8000.joblib.pkl'
    reg = None
    if os.path.exists(model_path):
        reg = joblib.load(model_path)
    else:
        train_num = int(len(X) * 0.8)
        reg = RandomForestRegressor(n_estimators=400, n_jobs=50)
        print("Training ...")
        reg.fit(X[:train_num], y[:train_num])
        _ = joblib.dump(reg, model_path)

    y_t = reg.predict(X[300000:])
    print("MAE: {}".format(mean_absolute_error(y[300000:], y_t)))
    print("MSE: {}".format(mean_squared_error(y[300000:], y_t)))
    util.write_csv(y_t, y[300000:], 'result/rf_all.csv')
def export_to_csv(self, results: Collection[RepoVersionDiffs],
                  repos: ResultMap[GitRepoPath],
                  roles: ResultMap[GalaxyRole]) -> None:
    header_files = ('role id', 'role name', 'owner', 'repo', 'touched file',
                    'insertions', 'deletions', 'v1..v2')
    header_lines = ('role id', 'role name', 'owner', 'repo',
                    'insertions', 'deletions', 'v1..v2')
    header_commits = ('role id', 'role name', 'owner', 'repo', 'commit sha1',
                      'author name', 'author email', 'date', 'v1..v2')

    files: List[Tuple[str, str, str, str, str, int, int, str]] = []
    lines: List[Tuple[str, str, str, str, int, int, str]] = []
    commits: List[Tuple[str, str, str, str, str, str, str, int, str]] = []
    for diffs in results:
        role = roles[diffs.id]
        for bump_diff in diffs.bumps:
            diff_id = bump_diff.id
            files.extend(
                    ((role.id, role.name, role.github_user, role.github_repo,
                      str(f.file_path), f.insertions, f.deletions, diff_id)
                     for f in bump_diff.touched_files))
            lines.append(
                    (role.id, role.name, role.github_user, role.github_repo,
                     bump_diff.insertions, bump_diff.deletions, diff_id))
            commits.extend(
                    ((role.id, role.name, role.github_user, role.github_repo,
                      commit.sha1, commit.author_name, commit.author_email,
                      commit.authored_date, diff_id)
                     for commit in bump_diff.commits))

    self.out.mkdir(exist_ok=True, parents=True)
    write_csv(self.out / 'commits.csv', header_commits, commits)
    write_csv(self.out / 'touched_files.csv', header_files, files)
    write_csv(self.out / 'touched_lines.csv', header_lines, lines)
def run_dataset(filepath_train, filepath_test, filepath_output):
    x_train, y_train = util.load_csv(filepath_train)
    x_test, y_test = util.load_csv(filepath_test)

    clf = Perceptron()
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    train_accuracy = clf.score(x_train, y_train)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)

    # confusion matrix
    cmatrix = metrics.confusion_matrix(y_test, y_pred)
    metrics.plot_confusion_matrix(clf, x_test, y_test)

    # evaluation
    classification_report = metrics.classification_report(y_test, y_pred)

    # print to output file
    util.write_csv(filepath_output, y_test, y_pred, cmatrix)

    # print to console for debug purposes
    print_result(clf, train_accuracy, test_accuracy, y_pred, cmatrix,
                 classification_report, filepath_output)
def run_dataset(filepath_train, filepath_test, filepath_output):
    x_train, y_train = util.load_csv(filepath_train)
    x_test, y_test = util.load_csv(filepath_test)

    gnb = GaussianNB()
    y_pred = gnb.fit(x_train, y_train).predict(x_test)
    train_accuracy = gnb.score(x_train, y_train)
    test_accuracy = metrics.accuracy_score(y_test, y_pred)

    # confusion matrix
    cmatrix = metrics.confusion_matrix(y_test, y_pred)
    metrics.plot_confusion_matrix(gnb, x_test, y_test)

    # evaluation
    classification_report = metrics.classification_report(y_test, y_pred)

    # output file
    util.write_csv(filepath_output, y_test, y_pred, cmatrix)

    # print result to console
    print_result(gnb, train_accuracy, test_accuracy, y_pred, cmatrix,
                 classification_report, filepath_output)
def write_verb_prefixes(upasargas, other, out_path):
    with util.read_csv(upasargas) as reader:
        upasargas = list(reader)
    with util.read_csv(other) as reader:
        other = list(reader)
        labels = reader.fieldnames

    assert 'prefix_type' in labels
    for x in upasargas:
        assert 'prefix_type' not in x
        x['prefix_type'] = 'upasarga'

    rows = sorted(upasargas + other, key=lambda x: util.key_fn(x['name']))
    with util.write_csv(out_path, labels) as write_row:
        for row in rows:
            write_row(row)
def write_prefixed_shs_verbal_indeclinables(final_path, sandhi_rules,
                                            prefixed_roots, root_converter,
                                            out_path):
    """Write prefixed SHS verbal indeclinables."""
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(final_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair
            row['root'] = root

            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(
                            result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = result['hom']
                clean_rows.append(new_row)
        labels = reader.fieldnames
    labels += ['hom']

    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))
    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
def write_prefixed_shs_verbal_data(data_path, prefixed_roots, root_converter,
                                   sandhi_rules, out_path):
    """Write Sanskrit Heritage Site data after converting its roots.

    :param data_path: path to the actual verb data
    :param out_path: path to the output CSV
    """
    sandhi = make_sandhi_object(sandhi_rules)

    root_to_prefixed = {}
    with util.read_csv(prefixed_roots) as reader:
        for row in reader:
            root_to_prefixed.setdefault(row['unprefixed_root'], []).append(row)

    labels = None
    clean_rows = []
    with util.read_csv(data_path) as reader:
        for row in reader:
            root_pair = root_converter.get(row['root'])
            if root_pair is None:
                continue
            root, hom = root_pair

            for result in root_to_prefixed.get(root, []):
                new_row = row.copy()
                for field in ['form', 'stem']:
                    if field in row:
                        new_row[field] = sandhi.join(
                            result['prefixes'].split('-') + [new_row[field]])
                new_row['root'] = result['prefixed_root']
                new_row['hom'] = hom
                clean_rows.append(new_row)
        labels = reader.fieldnames + ['hom']

    old_rows = list(util.read_csv_rows(out_path))
    clean_rows.sort(key=lambda x: util.key_fn(x['root']))
    with util.write_csv(out_path, labels) as write_row:
        for row in old_rows:
            write_row(row)
        for row in clean_rows:
            write_row(row)
def main():
    global queue_size

    # Initialize Myo and create a Hub and our listener.
    myo.init(sdk_path='./myo_sdk/')
    hub = myo.Hub()
    listener = Listener(queue_size)

    def get_data():
        global count
        global flg_get_data
        global all_data

        emgs = np.array([x[1] for x in listener.get_emg_data()]).T
        if emgs.shape == (8, queue_size):
            if count > DATANUM_TOTAL - 2:
                flg_get_data = False
            label_index = int(count / DATANUM_EACH) % LEBELS_NUM
            label = LABELS[label_index]
            label = np.array(label).astype('int32')
            print(
                f' label_index: {label_index} {LABELS[label_index]} {count+1}/{DATANUM_TOTAL}'
            )
            count += 1

            f = emgs
            m = np.mean(f, axis=1)
            v = np.var(f, axis=1)
            # m_norm = normalize(m)
            # v_chg = sigmoid(v - np.mean(v))
            m = np.mean(np.abs(f), axis=1)

            # Keep the first half of the FFT amplitude spectrum per channel.
            F = np.fft.fft(f)
            Amp = np.abs(F)
            first_Amp = Amp[:, 0:int(queue_size / 2)]  # size: 8 * queue_size/2
            flat_Amp = np.reshape(first_Amp, (1, int(8 * queue_size / 2)))[0]
            flat_Amp_norm = normalize(flat_Amp)

            # size: len(label) + 8 * queue_size/2
            # save_data = np.hstack((label, flat_Amp))
            save_data = np.hstack((label, m, flat_Amp_norm))
            save_data = list(save_data)
            all_data = data_append(all_data, save_data)
        else:
            print("buffering")

    try:
        threading.Thread(
            target=lambda: hub.run_forever(listener.on_event)).start()
        while flg_get_data:
            get_data()
            time.sleep(1)

        # Save file.
        print('saving data...')
        print('Do not remove Myo')
        print(np.array(all_data).shape)
        write_csv(all_data, SAVE_DATA_PATH)
        print('finish', SAVE_DATA_PATH)
    finally:
        hub.stop()
def write_output_csv(rows):
    if output_csv is not None:
        ut.write_csv(rows, output_csv)
x_range = [-100.0, 100.0]
N_train = int(args.Ntrain)
N_test = int(args.Ntest)
N = N_train + N_test

# Choose model parameters from the command line when they are supplied;
# otherwise generate them.
if args.beta != 0.0:
    beta = np.array(args.beta)
    beta = np.reshape(beta, (1, beta.shape[0]))
else:
    # generate model parameters
    beta = s.get_model_parameters(beta_range, args.dim + 1)

# generate data
X = s.generate_data(x_range, N, args.dim)

# split data into train and test and normalize
X_train, X_test = s.split_data(X, N_train)
X_train, X_test = s.normalize_sets(X_train, X_test)

# add bias
X_train, X_test = s.add_bias(X_train, X_test)
Y_train = s.get_labels(X_train, beta)
Y_test = s.get_labels(X_test, beta)

util.write_csv('./beta_{0}.csv'.format(args.dim), 'beta', beta, args.precision)
util.write_csv('./X_train.csv', 'x', X_train, args.precision)
util.write_csv('./Y_train.csv', 'y', Y_train, args.precision)
util.write_csv('./X_test.csv', 'x', X_test, args.precision)
util.write_csv('./Y_test.csv', 'y', Y_test, args.precision)
def main(args): # Set revcomp parameter. if args.r != 1: args.r = False elif args.r == 1 and args.alphabet != 'DNA': print("Error, the -r parameter can only be used in DNA.") elif args.r == 1 and args.alphabet == 'DNA': args.r = True # Set alphabet parameter. if args.alphabet == 'DNA': args.alphabet = index_list.DNA elif args.alphabet == 'RNA': args.alphabet = index_list.RNA elif args.alphabet == 'Protein': args.alphabet = index_list.PROTEIN if args.method.upper() == 'KMER': if args.k is None: print "parameters k is required. The default value of k is 2." args.k = 2 if args.r is None: print "parameters r is required. The default value of r is 0." args.r = 0 res = make_kmer_vector(k=args.k, alphabet=args.alphabet, filename=args.inputfile, revcomp=args.r) elif args.method.upper() == 'IDKMER': if args.k is None: print "parameters k is required. The default value of k is 6." args.k = 6 if args.ps is None or args.ns is None: print 'The positive and the negative source files are required.' return False res = idkmer(k=args.k, filename=args.inputfile, pos_src_name=args.ps, neg_src_name=args.ns) elif args.method.upper() == "MISMATCH": if args.k is None: print "parameters k is required. The default value of k is 3." args.k = 3 if args.m is None: print "parameters m is required. The default value of m is 1." args.m = 1 if args.m >= args.k: print "parameters m should be less than parameter k." else: res = getMismatchProfileMatrix(args.inputfile, args.alphabet, args.k, args.m) elif args.method.upper() == "SUBSEQUENCE": if args.delta is None: print "parameters delta is required. The default value of delta is 1." args.delta = 1 elif args.delta > 1 or args.delta < 0: print "delta should be greater than or equal to 0 and less than or equal to 1." if args.k is None: print "parameters k is required. The default value of k is 3." args.k = 3 res = getSubsequenceProfileByParallel(filename=args.inputfile, alphabet=args.alphabet, k=args.k, delta=args.delta) elif args.method.upper() == 'DR': if args.alphabet != index_list.PROTEIN: print 'DR method is only available for Protein.' return False elif args.max_dis < 0 or args.max_dis > 10: print 'The max distance can not be negative integer and should be smaller than 11.' return False else: res = dr_method(inputfile=args.inputfile, max_dis=args.max_dis) print res elif args.method.upper() == 'DP': if args.alphabet != index_list.PROTEIN: print 'Distance Pair method is only available for Protein.' return False elif args.max_dis < 0 or args.max_dis > 10: print 'The max distance can not be negative integer and should be smaller than 11.' return False else: if args.cp == 'cp_13': reduce_alphabet_scheme = const.cp_13 elif args.cp == 'cp_14': reduce_alphabet_scheme = const.cp_14 elif args.cp == 'cp_19': reduce_alphabet_scheme = const.cp_19 elif args.cp == 'cp_20': reduce_alphabet_scheme = const.cp_20 res = get_pseaacdis_matrix(filename=args.inputfile, reduce_alphabet_scheme=reduce_alphabet_scheme, max_distance=args.max_dis, alphabet=args.alphabet) else: print("Method error!") # Write correspond res file. if args.f == 'svm': if args.multi == 0 and args.l is None: args.l = '+1' elif args.multi == 0 and (args.l != '+1' and args.l != '-1'): print "For binary classification, the label should be either '+1' or '-1'." return False elif args.multi == 1 and args.l is None: args.l = '0' elif args.multi == 1 and args.l is not None: try: label = int(args.l) except ValueError: print 'The labels should be integer.' 
return False from util import write_libsvm write_libsvm(res, [args.l] * len(res), args.outputfile) elif args.f == 'tab': from util import write_tab write_tab(res, args.outputfile) elif args.f == 'csv': from util import write_csv write_csv(res, args.outputfile)
def write_prefix_groups(prefixed_roots, unprefixed_roots, upasargas, other,
                        sandhi_rules, out_path):
    """Parse the prefixes in a prefixed root and write out the prefix groups.

    The procedure is roughly as follows:

        for each prefixed root in `prefixed_roots`:
            find (p_1, ..., p_n, r), where p_x is a prefix and r is a root
            write the prefix group (p_1, ..., p_n) to file.

    We find (p_1, ..., p_n) by using the rules in `sandhi_rules` and verify
    that `p_x` is a prefix by checking for membership in `upasargas` and
    `other`.
    """

    # Loading prefixes
    all_prefixes = set()
    with util.read_csv(upasargas) as reader:
        all_prefixes.update([x['name'] for x in reader])
    with util.read_csv(other) as reader:
        all_prefixes.update([x['name'] for x in reader])

    # The 's' prefix is used in roots like 'saMskf' and 'parizkf'. Although it
    # is prefixed to a verb, it is not semantically the same as the other verb
    # prefixes. Here, though, we treat it as a verb prefix.
    all_prefixes.add('s')

    # Some prefixes have alternate forms.
    prefix_alternates = {
        'pi': 'api',
        'ut': 'ud',
        'Ri': 'ni',
        'niz': 'nis',
        'iz': 'nis',
        'palA': 'parA',
        'pali': 'pari',
        'z': 's',
    }
    all_prefixes.update(prefix_alternates.keys())

    # Loading sandhi rules
    sandhi = make_sandhi_object(sandhi_rules)

    with util.read_csv(prefixed_roots) as reader:
        rows = []
        for row in reader:
            # Nibble away at `prefixed_root` until we have all prefixes for
            # the given root.
            prefixes = []
            prefixed_root = row['prefixed_root']
            unprefixed_root = row['unprefixed_root']
            last_letter = None

            q = Queue.PriorityQueue()
            for remainder in sandhi.split_off(prefixed_root, unprefixed_root):
                q.put_nowait((0, (), remainder))

            while not q.empty():
                _, cur_prefixes, remainder = q.get_nowait()

                # `remainder` is something we recognize: we're done!
                if remainder in all_prefixes:
                    prefixes = list(cur_prefixes)
                    if remainder:
                        prefixes.append(remainder)
                        last_letter = remainder[-1]
                    break

                for before, after in sandhi.splits(remainder):
                    # Prevent recursion. As of this comment, the `splits`
                    # method returns the non-split of some term X as (X, '').
                    # In other words, this conditional will *never* be true.
                    # But since the behavior of various functions is still
                    # unsettled, this check will stay here for the time being.
                    if after == remainder:
                        continue

                    if before in all_prefixes:
                        state = (cur_prefixes + (before,), after)
                        cost = len(after)

                        # Incentivize short vowels. This avoids errors with
                        # roots like "upodgrah" ("upa-ud-grah"). Without the
                        # incentive, we could have "upa-A-ud-grah" instead.
                        if before and before[-1] in 'aiufx':
                            cost -= 1
                        q.put_nowait((cost,) + state)

            # Convert 'alternate' prefixes back to their original forms.
            prefixes = [prefix_alternates.get(x, x) for x in prefixes]
            if not prefixes:
                # Occurs if the root's prefix is unrecognized
                continue

            # We still don't know the prefix group. We can find it by
            # splitting off the root and keeping whatever matches
            # `last_letter`.
            for group in sandhi.split_off(prefixed_root, unprefixed_root):
                if group[-1] == last_letter:
                    break

            prefix_string = '-'.join(prefixes)
            rows.append((group, prefix_string))

    labels = ['group', 'prefixes']
    with util.write_csv(out_path, labels) as write_row:
        for row in util.unique(rows):
            datum = dict(zip(labels, row))
            write_row(datum)
#! /usr/bin/env python3

import re

import util

URL = "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes?action=raw"
CACHE = "article.wiki"
COLUMNS = ["Code", "Message"]


def scrape():
    for line in open(util.get_cache_file(CACHE, URL)):
        m = re.match(r"^;\{\{.*?\}\}(\d{3}) (.*?)\s*$", line)
        if m:
            yield m.group(1), m.group(2).replace("[[", "").replace("]]", "")


if __name__ == "__main__":
    util.write_csv(COLUMNS, scrape())