def cli(pass_folder, fail_folder, test_folder, refseq_database, ref_folder):
    # Extract features for pass data, fail data, and reference data if it hasn't already been done.
    if not os.path.isfile(os.path.join(fail_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=fail_folder, refseq_database=refseq_database, report=True)
    if not os.path.isfile(os.path.join(pass_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=pass_folder, refseq_database=refseq_database, report=True)
    if not os.path.isfile(os.path.join(ref_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=ref_folder, refseq_database=refseq_database, report=True)

    # Combine the dataframes for training data so that we can fit our decision tree.
    df = combine_csv_files(fail_folder=fail_folder, pass_folder=pass_folder, ref_folder=ref_folder)
    dt = fit_model(df)

    # Extract features for our test set if it hasn't already been done and attempt to predict results.
    if not os.path.isfile(os.path.join(test_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=test_folder, refseq_database=refseq_database, report=True)
    predict_results(test_folder, dt, df)

    # TODO: Add check that the FASTA folder actually has files in it.
    with open('model.p', 'wb') as model_file:
        pickle.dump(dt, model_file)
    with open('dataframe.p', 'wb') as dataframe_file:
        pickle.dump(df, dataframe_file)
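# A minimal sketch (an assumption, not part of the original CLI) of how the pickled
# model and dataframe written above might be reloaded later to classify a new folder.
import pickle

with open('model.p', 'rb') as model_file:
    dt = pickle.load(model_file)
with open('dataframe.p', 'rb') as dataframe_file:
    df = pickle.load(dataframe_file)
# The reloaded objects can then be passed to predict_results() on a new test folder.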
def main():
    print("Question 1:")
    print("Questions 2 and 3 will be shown after the image window is closed")
    print("-----------------------------")
    extract_features.main()

    print("Question 2:")
    print("-----------------------------")
    camera_calibration.main()

    print("Question 3:")
    print("-----------------------------")
    print("noise_version1")
    RANSAC.main("data/noise_version1.txt")
    print("-----------------------------")
    print("noise_version2")
    RANSAC.main("data/noise_version2.txt")
def get_bio_bert_embedding():
    biobert_data = main()
    data = [json.loads(line) for line in biobert_data]
    sentence_vectors = []
    token_vectors = []
    for data_point in data:
        sentence_vector, token_vector = get_bio_bert_embedding_helper(data_point)
        sentence_vectors.append(sentence_vector)
        token_vectors.append(token_vector)
    return sentence_vectors, token_vectors
def exctract_features_from_midi_files(input_folder):
    """ Extract features from midi files """
    if not os.path.isdir(os.path.join(input_folder, 'GENERATED', 'midi')):
        # makedirs creates 'GENERATED' and 'GENERATED/midi' in one call, and also
        # works when 'GENERATED' already exists but 'midi' does not.
        os.makedirs(os.path.join(input_folder, 'GENERATED', 'midi'))

    # Move the MIDI files into the generated midi directory
    source = input_folder
    dest = os.path.join(source, 'GENERATED', 'midi')
    files = os.listdir(source)
    for f in files:
        if os.path.isdir(os.path.join(source, f)):
            # Skip sub-directories (including the 'GENERATED' folder itself)
            continue
        for retry in range(100):
            try:
                os.rename(os.path.join(source, f), os.path.join(dest, f))
                break
            except OSError:
                print("rename failed, retrying...")

    if not os.path.isdir(os.path.join(input_folder, 'GENERATED', 'features')):
        os.mkdir(os.path.join(input_folder, 'GENERATED', 'features'))

    ef.main(input_folder)
def main(review_list):
    # Getting the complete list
    complete_list = get_complete_list.main(review_list)
    # Extracting features
    feature_set = extract_features.main(complete_list)
    # Getting list of each review and its corresponding phrases
    review_phrase_dict_list = rules.main(complete_list, feature_set)
    # Getting polarity for each feature and its dedicated phrases
    feature_phrase_polarity, phrase_polarity, phrase_feature = polarity.main(review_phrase_dict_list, feature_set)
    # Getting final polarity of each feature
    final_polarity = polarity.final_polarity(feature_phrase_polarity)
    return final_polarity, review_phrase_dict_list, phrase_polarity, phrase_feature
ii = i * 100
lat = latArray[ii, 0]
lon = lonArray[ii, 0]
sample_date = datArray[ii, 0]
cnt = cntArray[ii, 0]
print('Count = ' + str(cnt))

#h5name = '/Users/csprh/Dlaptop/MATLAB/MYCODE/HAB/WORK/HAB/florida2/Cube_09073_09081_737173.h5'
#outputDirectory = '/Users/csprh/Dlaptop/MATLAB/MYCODE/HAB/WORK/HAB/CNNIms'
#h5name = '/home/cosc/csprh/linux/HABCODE/scratch/HAB/tmpTest/testCubes/Cube_09073_09081_737173.h5'
#mstringApp = '/Applications/MATLAB_R2016a.app/bin/matlab'
h5name = '/home/cosc/csprh/linux/HABCODE/scratch/HAB/tmpTest/testCubes/Cube_Test.h5'
outputDirectory = '/home/cosc/csprh/linux/HABCODE/scratch/HAB/tmpTest/CNNIms'
mstringApp = 'matlab'

# GENERATE DATACUBE FROM LAT, LON, DATE (not necessary if you already have datacube).
mstring = (mstringApp + ' -nosplash -r \"genSingleH5sWrapper ' + str(lat) + ' ' +
           str(lon) + ' ' + str(sample_date) + ' ' + h5name + '\;quit;"')
os.system(mstring)

# GENERATE IMAGES FROM DATA CUBE
mstring = (mstringApp + ' -nosplash -r \"outputImagesFromDataCubeScript ' + h5name + ' ' +
           outputDirectory + '\;quit;"')
os.system(mstring)

# EXTRACT BOTTLENECK FEATURES FROM IMAGES
extract_features.main(['cnfgXMLs/NASNet11_lstm0.xml', outputDirectory])

# GENERATE CLASSIFICATION FROM BOTTLENECK FEATURES AND TRAINED MODEL
testHAB.main(['cnfgXMLs/NASNet11_lstm0.xml', outputDirectory])
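# A minimal sketch (an assumption, not part of the original pipeline): the same MATLAB
# invocation run through subprocess.run instead of os.system, so that a non-zero exit
# status raises an error instead of being silently ignored.
import subprocess

try:
    subprocess.run(mstring, shell=True, check=True)
except subprocess.CalledProcessError as err:
    print('MATLAB step failed with exit code ' + str(err.returncode))
    raise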
    # normalize the embeddings
    sentence_vector = [[element / number_of_tokens for element in elem] for elem in sentence_vector]
    return sentence_vector, token_vector


def pickle_dumper(dataX, dataY, split):
    # Change the path each time
    path = embedding_dir + "Chemicals/biosemantics/"
    with open(os.path.join(path, "embds_" + split + ".pickle"), "wb") as fp:
        pickle.dump(dataX, fp)
    with open(os.path.join(path, "tokens_" + split + ".pickle"), "wb") as fp:
        pickle.dump(dataY, fp)


biobert_data = main()  # uses a large amount of memory
data = [json.loads(line) for line in biobert_data]
sentence_vectors = []
token_vectors = []
for data_point in data:
    sent_vec, token_vec = get_bio_bert_embedding_helper(data_point)
    sentence_vectors.append(sent_vec)
    token_vectors.append(token_vec)
pickle_dumper(sentence_vectors, token_vectors, "train")
setup_tokenizers(('.', ';'))

@textual_feature('words', 'ancient_greek')  #Using 'words' makes the input 'file' parameter become a list of words
def num_conjunctions(file):  #parameter must be the text of a file
    return reduce(lambda count, word: count +
                  (1 if word in {normalize('NFD', val) for val in
                                 ['καί', 'καὶ', 'ἀλλά', 'ἀλλὰ', 'ἤ', 'ἢ']} else 0),
                  file, 0)

@textual_feature('sentences', 'ancient_greek')  #Using 'sentences' makes the input 'file' parameter become a list of sentences
def mean_sentence_length(file):  #parameter must be the text of a file
    return reduce(lambda count, sentence: count + len(sentence), file, 0) / len(file)

@textual_feature()  #Not putting any decorator parameters will leave the input 'file' parameter unchanged as a string of text
def num_interrogatives(file):  #parameter must be the text of a file
    return file.count(';')

extract_features.main(corpus_dir='demo_files', file_extension='tess',
                      output_file=os.path.join('demo_files', 'output.pickle'))

'''
Extracting features from .tess files in demo_files

Progress |███████████████████████████████████████████| 100.0% (4 of 4 files)
Feature mining complete. Attempting to write feature results to "demo_files/output.pickle"...
Success!

Elapsed time: 1.262120753992349
'''

#************************************************************************************************************************

'''
2) Train & Test machine learning models on the features

Use the "@model_analyzer()" decorator to label functions that analyze machine learning models
def execute(node, previous, experiment_folder):
    """
    Execute a task defined by the given node in the experiment graph.

    Parameters
    ----------
    node : Element
        The node to be executed.

    previous : dict (or list of dict)
        Dictionary of the experiment's running-time variables after the end of
        the parent node's execution. May be a list of dictionaries in the
        special case of a fusion node, which has more than one parent.

    experiment_folder : string
        String with the path to the experiment folder, where the files of the
        experiment will be saved.

    Returns
    -------
    exp_param : dict
        The updated dictionary of the experiment's running-time variables after
        the node's execution.
    """

    global execution_time
    global tex_path
    global tex_dict
    global openset_experiment

    exp_param = previous
    parameters = ast.literal_eval(node.get("parameters"))
    node_id = node.attrib['id']

    #Get node name
    node_name = node.get('name')

    if node.tag == "collection":
        print "Collection", exp_param.keys()

        images, classes, extract_path, read_time = \
            read_collection.main(node_name, openset_experiment, parameters,
                                 node_id)
        execution_time += read_time

        exp_param['images'] = images
        exp_param['classes'] = classes
        exp_param['extract_path'] = extract_path

    elif node.tag == "train_test_method":
        print "train_test_method", exp_param.keys()

        images = exp_param['images']
        classes = exp_param['classes']

        images, classes, train_test_list, train_test_time = \
            train_test.main(images, classes, experiment_folder, node_name,
                            parameters, openset_experiment, node_id)
        execution_time += train_test_time

        exp_param['images'] = images
        exp_param['classes'] = classes
        exp_param['train_test_list'] = train_test_list
        exp_param['train_test_method'] = node_name
        exp_param['train_test_parameters'] = parameters

    elif node.tag == "descriptor":
        print "descriptor", exp_param.keys()

        images = exp_param['images']
        extract_path = exp_param['extract_path']
        classes_keys = exp_param['classes'].keys()

        if node_name == "bag":
            train_test_list = exp_param['train_test_list']
            images, extract_time = extract_bag.main(images, train_test_list,
                                                    extract_path,
                                                    experiment_folder,
                                                    parameters, node_id)
        elif node_name == "bovg":
            train_test_list = exp_param['train_test_list']
            images, extract_time = extract_bovg.main(images, train_test_list,
                                                     extract_path,
                                                     experiment_folder,
                                                     parameters, node_id)
        else:
            images, extract_time = extract_features.main(images, classes_keys,
                                                         extract_path,
                                                         node_name, parameters,
                                                         node_id)
        execution_time += extract_time

        exp_param['images'] = images
        exp_param['descriptor'] = node_name

    elif node.tag == "normalizer":
        try:
            manager = Manager()
            images = manager.dict(exp_param['images'])
            train_test_list = exp_param['train_test_list']
        except:
            print "\n\tMissing Input. Exiting."
            sys.exit(1)

        norm_fv_paths, normalize_time = normalize_features.main(
            images, train_test_list, experiment_folder, node_name, parameters,
            node_id)
        execution_time += normalize_time

        del exp_param['images']
        exp_param['fv_paths'] = norm_fv_paths

    elif node.tag == "classifier":
        try:
            classes = exp_param['classes']
            train_test_list = exp_param['train_test_list']
            descriptor = exp_param['descriptor']
            try:
                fv_paths = exp_param['fv_paths']
                del exp_param['fv_paths']
            except:
                images = exp_param['images']
                fv_paths = util.save_file_extract(images, train_test_list,
                                                  experiment_folder)
        except:
            print "\n\tMissing Input. Exiting."
            sys.exit(1)

        images, classes_list, classify_time = classify.main(
            fv_paths, classes.keys(), train_test_list, experiment_folder,
            node_name, parameters, descriptor, node_id)
        execution_time += classify_time

        exp_param['images'] = images
        exp_param['classes_list'] = classes_list

    elif node.tag == "fusion_method":
        len_exp_param = len(exp_param)

        #list with the images dictionaries, classes dictionaries, and train and
        # test set list
        list_images = []
        list_classes = []
        list_train_test = []

        extract_path = exp_param[INDEX_ZERO]['extract_path']
        for index in range(len_exp_param):
            try:
                list_images.append(exp_param[index]['images'])
            except:
                images = {}
                for fv_path in exp_param[index]['fv_paths']:
                    print "fv_path:", fv_path
                    images_new = util.read_fv_file(fv_path)
                    images = util.merge_dict(images, images_new)
                list_images.append(images)

            list_classes.append(exp_param[index]['classes'])

            #In case that it performs the fusion of collections, there is no
            # train_test_list
            try:
                list_train_test.append(exp_param[index]['train_test_list'])
            except:
                list_train_test.append(None)

        #classes_list is present only after the classification module
        try:
            classes_list = exp_param[INDEX_ZERO]['classes_list']
        except:
            classes_list = None

        try:
            train_test_method = exp_param[INDEX_ZERO]['train_test_method']
            train_test_parameters = exp_param[INDEX_ZERO][
                'train_test_parameters']
        except:
            train_test_method = None
            train_test_parameters = None

        images, classes, train_test_list, fusion_time = \
            fusion.main(list_images, list_classes, list_train_test,
                        classes_list, experiment_folder, node_name, parameters,
                        node_id)
        execution_time += fusion_time

        exp_param = {}
        exp_param['images'] = images
        exp_param['classes'] = classes
        if train_test_list is not None:
            exp_param['train_test_list'] = train_test_list
        if classes_list is not None:
            exp_param['classes_list'] = classes_list
        if train_test_method is not None:
            exp_param['train_test_method'] = train_test_method
            exp_param['train_test_parameters'] = train_test_parameters
        exp_param['descriptor'] = None
        exp_param['extract_path'] = extract_path

    elif node.tag == "evaluation_measure":
        try:
            images = exp_param['images']
            train_test_list = exp_param['train_test_list']
            classes_list = exp_param['classes_list']
        except:
            print "\n\tMissing Input. Exiting."
            sys.exit(1)

        evaluation_time, evaluation_path = evaluation.main(
            images, train_test_list, classes_list, experiment_folder,
            node_name, parameters, node_id)
        execution_time += evaluation_time

        #Dictionaries to create the tex file
        train_test_method = exp_param['train_test_method']
        train_test_parameters = str(exp_param['train_test_parameters'])

        if train_test_method not in tex_dict:
            tex_dict[train_test_method] = {}
        train_test_dict = tex_dict[train_test_method]

        if train_test_parameters not in train_test_dict:
            train_test_dict[train_test_parameters] = {}
        output_dict = train_test_dict[train_test_parameters]

        if node_name not in output_dict:
            output_dict[node_name] = []
        list_output = [evaluation_path, classes_list[0], node_id]
        if list_output not in output_dict[node_name]:
            output_dict[node_name].append(list_output)

        train_test_dict[train_test_parameters] = output_dict
        tex_dict[train_test_method] = train_test_dict

    elif node.tag == "preprocessing":
        images = exp_param['images']
        classes = exp_param['classes']

        images, classes, preprocessing_time = preprocessing.main(
            images, classes, experiment_folder, node_name, parameters, node_id)
        execution_time += preprocessing_time

        exp_param['images'] = images
        exp_param['classes'] = classes

    else:
        print "Error. Unknown Tag."
        sys.exit(1)

    return exp_param
        str(categories_to_include - genre_to_files.keys()))

#https://stackoverflow.com/a/13738951/7102572
corpus_dir = 'grc'
if not os.path.isdir(corpus_dir):
    try:
        cmd = 'svn'
        proc = subprocess.run([
            cmd, 'export',
            'https://github.com/timgianitsos/tesserae/trunk/texts/grc'
        ])
        proc.check_returncode()
    except OSError as e:
        print(f'Your system may not have "{cmd}" installed')
        raise e
    except subprocess.CalledProcessError as e:
        raise e

#Feature extractions
extract_features.main(
    corpus_dir,
    'tess',
    #Exclude all files of genres not specified. Exclude composite files no matter what
    excluded_paths=composite_files | (set() if len(sys.argv) <= 2 else reduce(
        lambda cur_set, next_set: cur_set | next_set,
        (genre_to_files[tok] for tok in genre_to_files
         if tok not in categories_to_include), set())),
    output_file=None if len(sys.argv) <= 1 else sys.argv[1])
import greek_features  #seemingly unused here, but this makes the environment recognize features
import extract_features
from corpus_categories import composite_files, verse_misc_files, prose_files
import os
import sys

if __name__ == '__main__':
    #Download corpus if non-existent
    corpus_dir = os.path.join('tesserae', 'texts', 'grc')
    tesserae_clone_command = 'git clone https://github.com/timgianitsos/tesserae.git'
    if not os.path.isdir(corpus_dir):
        print(RED + 'Corpus at ' + corpus_dir + ' does not exist - attempting to clone repository...' + RESET)
        if os.system(tesserae_clone_command) != 0:
            raise Exception('Unable to obtain corpus for feature extraction')

    #Feature extractions
    extract_features.main(
        corpus_dir,
        'tess',
        #Exclude the following directories and files
        excluded_paths=composite_files | verse_misc_files | prose_files,
        #Output the results to a file to be processed by machine learning algorithms
        output_file=None if len(sys.argv) <= 1 else sys.argv[1])
OUTPUT_DIR = 'gs://{}/bert/models/{}'.format(BUCKET, TASK)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

# Download the data.
! wget https://competitions.codalab.org/my/datasets/download/69a3e8d0-b836-48b8-8795-36a6865a1c04
! unzip 69a3e8d0-b836-48b8-8795-36a6865a1c04
! rm 69a3e8d0-b836-48b8-8795-36a6865a1c04
! mv data.tsv data/data.tsv
! mv eval1_unlabelled.tsv data/testdata.tsv

# Create a smaller dummy dataset from the first rows of the full TSV.
datafile_name = "data/data.tsv"
smalldb = pd.read_csv(datafile_name, sep='\t', nrows=10000)
trainingSet, testSet = train_test_split(smalldb, test_size=0.1)
trainingSet.to_csv("data/traindata.tsv", sep='\t')
testSet.to_csv("data/validationdata.tsv", sep='\t')

## Using tokenization
!rm -rf msaic
!git clone https://vaibhavgeek:[email protected]/vaibhavgeek/msaic.git
import sys
sys.path += ["msaic/bert-repo"]
import extract_features
extract_features.main("data/traindata.tsv", "bert-msaic")
#mstringApp = '/Applications/MATLAB_R2016a.app/bin/matlab'
xmlName = '/home/cosc/csprh/linux/HABCODE/code/HAB/extractData/configHABunderDesk.xml'
mstringApp = 'matlab'

tree = et.parse(xmlName)
tree.find('.//testDate').text = sample_date_string
imsDir = tree.find('.//testImsDir').text
tree.write(xmlName)

imsDir = os.path.join(imsDir, sample_date_string)

modelD = os.getcwd()
os.chdir('../extractData')

# GENERATE DATACUBES FOR A BUNCH OF LAT, LON POSITIONS IN A GRID
mstring = mstringApp + ' -nosplash -r \"test_genAllH5s; quit;\"'
os.system(mstring)

os.chdir('postProcess')

# GENERATE IMAGES FROM DATA CUBES
# GENERATED LAT AND LONS latLonList.txt TEXT FILE IN imsDir
mstring = mstringApp + ' -nosplash -r \"test_cubeSequence; quit;\"'
os.system(mstring)

os.chdir(modelD)

# EXTRACT BOTTLENECK FEATURES FROM IMAGES
extract_features.main(['cnfgXMLs/NASNet11_lstm0.xml', imsDir])

# GENERATE CLASSIFICATION FROM BOTTLENECK FEATURES AND TRAINED MODEL
# GENERATED CLASSIFICATIONS ENTERED INTO classesProbs.txt TEXT FILE IN imsDir
testHAB.main(['cnfgXMLs/NASNet11_lstm0.xml', imsDir])
import tensorflow as tf
import numpy as np
import extract_features as ef

X_train, Y_train, X_test, Y_test = ef.main()
print("Feature Extraction Completed!")

nx = X_train.shape[0]
ny = Y_train.shape[0]
m_train = X_train.shape[1]
m_test = X_test.shape[1]

net_arch = [nx, 15, 10, ny]
num_layers = len(net_arch) - 1


def init_params():
    parameters = {}
    for i in range(num_layers):
        parameters['W' + str(i + 1)] = tf.get_variable(
            'W' + str(i + 1), shape=[net_arch[i + 1], net_arch[i]],
            initializer=tf.contrib.layers.xavier_initializer(seed=1))
        parameters['b' + str(i + 1)] = tf.get_variable(
            'b' + str(i + 1), shape=[net_arch[i + 1], 1],
            initializer=tf.contrib.layers.xavier_initializer(seed=1))
    return parameters


def create_network(A, parameters):
    for i in range(num_layers):
        W = parameters['W' + str(i + 1)]
        b = parameters['b' + str(i + 1)]
        if i < num_layers - 1:
            A = tf.nn.relu(tf.matmul(W, A) + b)
        else:
            A = tf.matmul(W, A) + b
    return A
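# A minimal sketch (an assumption, not part of the original file) of how init_params()
# and create_network() could be wired together under TensorFlow 1.x. The placeholder
# shapes, softmax cross-entropy cost, and Adam learning rate are illustrative guesses;
# Y_train is assumed to be one-hot encoded with examples stored column-wise.
X = tf.placeholder(tf.float32, shape=[nx, None], name='X')
Y = tf.placeholder(tf.float32, shape=[ny, None], name='Y')
parameters = init_params()
Z = create_network(X, parameters)

# Logits and labels are column-major above, so transpose before computing the cost.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=tf.transpose(Y), logits=tf.transpose(Z)))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, train_cost = sess.run([optimizer, cost], feed_dict={X: X_train, Y: Y_train})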
    # print(tensors)
    embeddings, labels, seq_len = db.find_entities(sents)
    print(labels)
    seqwrite = open('sequencesave.txt', mode='w')
    seqwrite.write(str(seq_len))
    seqwrite.close()
    # print(embeddings)
    # program.train_model(embeddings, labels, seq_len)
    program.train_bidirectional(embeddings, labels, seq_len)

elif choice == '1':
    model = models.load_model("SparseEntity.model")
    sentence = input('Input: ')
    sentence = sentence.replace("\n", "")
    writef = open('predictionfile.txt', mode='w')
    writef.write(sentence)
    writef.close()
    infile = open('predictionfile.txt').read()
    print(infile)
    ef.main('predictionfile.txt')
    tensor, sl = ef.build_tensors()
    # print(tensor)
    prediction = program.predict(model, tensor)
    print(prediction)
    prediction = prediction[0].tolist()
    print(max(prediction))
    print(prediction.index(max(prediction)))