def cli(pass_folder, fail_folder, test_folder, refseq_database, ref_folder):
    # Extract features for pass data, fail data, and reference data if it hasn't already been done.
    if not os.path.isfile(os.path.join(fail_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=fail_folder,
                              refseq_database=refseq_database,
                              report=True)
    if not os.path.isfile(os.path.join(pass_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=pass_folder,
                              refseq_database=refseq_database,
                              report=True)
    if not os.path.isfile(os.path.join(ref_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=ref_folder,
                              refseq_database=refseq_database,
                              report=True)

    # Combine the dataframes for training data so that we can fit our decision tree.
    df = combine_csv_files(fail_folder=fail_folder, pass_folder=pass_folder, ref_folder=ref_folder)
    dt = fit_model(df)

    # Extract features for our test set if it hasn't already been done and attempt to predict results.
    if not os.path.isfile(os.path.join(test_folder, 'extracted_features.csv')):
        extract_features.main(sequencepath=test_folder,
                              refseq_database=refseq_database,
                              report=True)
    predict_results(test_folder, dt, df)  # TODO: Add a check that the FASTA folder actually contains sequences.
    with open('model.p', 'wb') as model_file:
        pickle.dump(dt, model_file)
    with open('dataframe.p', 'wb') as dataframe_file:
        pickle.dump(df, dataframe_file)
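
# A minimal, hypothetical reload sketch (the load_trained_model name is an
# assumption, not part of the original snippet): the model and dataframe pickled
# by cli() above could later be restored with pickle.load for further predictions.
def load_trained_model(model_path='model.p', dataframe_path='dataframe.p'):
    with open(model_path, 'rb') as model_file:
        dt = pickle.load(model_file)
    with open(dataframe_path, 'rb') as dataframe_file:
        df = pickle.load(dataframe_file)
    return dt, df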
Example #2
def main():
    print("Question 1:")
    print("The question 2 and 3 will be showed after close image window")
    print("-----------------------------")
    extract_features.main()
    print("Question 2:")
    print("-----------------------------")
    camera_calibration.main()
    print("Question 3:")
    print("-----------------------------")
    print("noise_version1")
    RANSAC.main("data/noise_version1.txt")
    print("-----------------------------")
    print("noise_version2")
    RANSAC.main("data/noise_version2.txt")
def get_bio_bert_embedding():
    biobert_data = main()
    data = [json.loads(line) for line in biobert_data]

    sentence_vectors = []
    token_vectors = []
    for data_point in data:
        sentence_vector, token_vector = get_bio_bert_embedding_helper(
            data_point)
        sentence_vectors.append(sentence_vector)
        token_vectors.append(token_vector)

    return sentence_vectors, token_vectors
def extract_features_from_midi_files(input_folder):
    """
    Extract features from MIDI files.
    """
    if not os.path.isdir(os.path.join(input_folder, 'GENERATED', 'midi')):
        os.mkdir(os.path.join(input_folder, 'GENERATED'))
        os.mkdir(os.path.join(input_folder, 'GENERATED', 'midi'))
        # Move the MIDI files into the newly created folder
        source = input_folder
        dest = os.path.join(source, 'GENERATED', 'midi')
        files = os.listdir(source)
        for f in files:
            if f == 'GENERATED':
                # Skip the destination folder itself so it isn't moved into itself
                continue
            for retry in range(100):
                try:
                    os.rename(os.path.join(source, f), os.path.join(dest, f))
                    break
                except OSError:
                    print("rename failed, retrying...")

    if not os.path.isdir(os.path.join(input_folder, 'GENERATED', 'features')):
        os.mkdir(os.path.join(input_folder, 'GENERATED', 'features'))

    ef.main(input_folder)
def main(review_list):
	# Getting the complete list
	complete_list = get_complete_list.main(review_list)

	# Extracting features
	feature_set = extract_features.main(complete_list)

	# Getting list of each review and its corresponding phrases
	review_phrase_dict_list = rules.main(complete_list,feature_set)

	# Getting polarity for each feature and its dedicated phrases
	feature_phrase_polarity, phrase_polarity, phrase_feature = polarity.main(review_phrase_dict_list,feature_set)

	# Getting final polarity of each feature
	final_polarity = polarity.final_polarity(feature_phrase_polarity)
	return final_polarity, review_phrase_dict_list, phrase_polarity, phrase_feature
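
# A minimal, hypothetical driver sketch (the 'reviews.txt' file name and the
# one-review-per-line format are assumptions, not part of the original snippet).
if __name__ == '__main__':
	with open('reviews.txt') as review_file:
		reviews = [line.strip() for line in review_file if line.strip()]
	final_polarity, review_phrase_dict_list, phrase_polarity, phrase_feature = main(reviews)
	# Print the aggregated polarity computed for each extracted feature
	print(final_polarity)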
Example #6
    ii = i * 100
    lat = latArray[ii, 0]
    lon = lonArray[ii, 0]
    sample_date = datArray[ii, 0]
    cnt = cntArray[ii, 0]
    print('Count = ' + str(cnt))
    #h5name = '/Users/csprh/Dlaptop/MATLAB/MYCODE/HAB/WORK/HAB/florida2/Cube_09073_09081_737173.h5'
    #outputDirectory = '/Users/csprh/Dlaptop/MATLAB/MYCODE/HAB/WORK/HAB/CNNIms'
    #h5name = '/home/cosc/csprh/linux/HABCODE/scratch/HAB/tmpTest/testCubes/Cube_09073_09081_737173.h5'
    #mstringApp = '/Applications/MATLAB_R2016a.app/bin/matlab'

    h5name = '/home/cosc/csprh/linux/HABCODE/scratch/HAB/tmpTest/testCubes/Cube_Test.h5'
    outputDirectory = '/home/cosc/csprh/linux/HABCODE/scratch/HAB/tmpTest/CNNIms'
    mstringApp = 'matlab'

    # GENERATE DATACUBE FROM LAT, LON, DATE (not necessary if you already have datacube).
    mstring = mstringApp + ' -nosplash -r \"genSingleH5sWrapper ' + str(
        lat) + ' ' + str(lon) + ' ' + str(
            sample_date) + ' ' + h5name + '\;quit;"'
    os.system(mstring)

    # GENERATE IMAGES FROM DATA CUBE
    mstring = mstringApp + ' -nosplash -r \"outputImagesFromDataCubeScript ' + h5name + ' ' + outputDirectory + '\;quit;"'
    os.system(mstring)

    # EXTRACT BOTTLENECK FEATURES FROM IMAGES
    extract_features.main(['cnfgXMLs/NASNet11_lstm0.xml', outputDirectory])

    # GENERATE CLASSIFICATION FROM BOTTLENECK FEATURES AND TRAINED MODEL
    testHAB.main(['cnfgXMLs/NASNet11_lstm0.xml', outputDirectory])
Example #7
    # normalize the embeddings
    sentence_vector = [[element / number_of_tokens for element in elem]
                       for elem in sentence_vector]

    return sentence_vector, token_vector


def pickle_dumper(dataX, dataY, split):

    # Change the path each time
    path = embedding_dir + "Chemicals/biosemantics/"
    with open(os.path.join(path, "embds_" + split + ".pickle"), "wb") as fp:
        pickle.dump(dataX, fp)
    with open(os.path.join(path, "tokens_" + split + ".pickle"), "wb") as fp:
        pickle.dump(dataY, fp)


biobert_data = main()  # uses a large amount of memory

data = [json.loads(line) for line in biobert_data]

sentence_vectors = []
token_vectors = []
for data_point in data:
    sent_vec, token_vec = get_bio_bert_embedding_helper(data_point)
    sentence_vectors.append(sent_vec)
    token_vectors.append(token_vec)

pickle_dumper(sentence_vectors, token_vectors, "train")
setup_tokenizers(('.', ';'))

@textual_feature('words', 'ancient_greek') #Using 'words' makes the input 'file' parameter become a list of words
def num_conjunctions(file): #parameter must be the text of a file
	return reduce(lambda count, word: count + (1 if word in {normalize('NFD', val) for val in ['καί', 'καὶ', 'ἀλλά', 'ἀλλὰ', 'ἤ', 'ἢ']} else 0), file, 0)

@textual_feature('sentences', 'ancient_greek') #Using 'sentences' makes the input 'file' parameter become a list of sentences
def mean_sentence_length(file): #parameter must be the text of a file
	return reduce(lambda count, sentence: count + len(sentence), file, 0) / len(file)

@textual_feature() #Not putting any decorator parameters will leave the input 'file' parameter unchanged as a string of text
def num_interrogatives(file): #parameter must be the text of a file
	return file.count(';')


extract_features.main(corpus_dir='demo_files', file_extension='tess', output_file=os.path.join('demo_files', 'output.pickle'))
'''
Extracting features from .tess files in demo_files
Progress |███████████████████████████████████████████| 100.0% (4 of 4 files)
Feature mining complete. Attempting to write feature results to "demo_files/output.pickle"...
Success!


Elapsed time: 1.262120753992349
'''

#************************************************************************************************************************
'''
2) Train & Test machine learning models on the features

Use the "@model_analyzer()" decorator to label functions that analyze machine learning models
'''
Example #9
def execute(node, previous, experiment_folder):
    """
    Execute a task defined by the given node in the experiment graph.
    
    Parameters
    ----------
    
    node : Element
        The node to be executed.
        
    previous : dict (or list of dict)
        Dictionary of the experiment's running-time variables after the
        end of the parent node's execution.
        May be a list of dictionaries in the special case of a fusion node,
        which has more than one parent.
    
    experiment_folder : string
        String with the path to the experiment folder, where the files of the
        experiment will be saved.
        
    Returns
    -------
    
    exp_param : dict
        The updated dictionary of the experiment's running-time variables after
        the node's execution.
    
    """

    global execution_time
    global tex_path
    global tex_dict
    global openset_experiment

    exp_param = previous
    parameters = ast.literal_eval(node.get("parameters"))
    node_id = node.attrib['id']

    #Get node name
    node_name = node.get('name')

    if node.tag == "collection":
        print "Collection", exp_param.keys()

        images, classes, extract_path, read_time = \
                read_collection.main(node_name, openset_experiment, parameters,
                node_id)
        execution_time += read_time

        exp_param['images'] = images
        exp_param['classes'] = classes
        exp_param['extract_path'] = extract_path

    elif node.tag == "train_test_method":
        print "train_test_method", exp_param.keys()

        images = exp_param['images']
        classes = exp_param['classes']

        images, classes, train_test_list, train_test_time = \
                train_test.main(images, classes, experiment_folder, node_name,
                parameters, openset_experiment, node_id)
        execution_time += train_test_time

        exp_param['images'] = images
        exp_param['classes'] = classes
        exp_param['train_test_list'] = train_test_list

        exp_param['train_test_method'] = node_name
        exp_param['train_test_parameters'] = parameters

    elif node.tag == "descriptor":
        print "descriptor", exp_param.keys()

        images = exp_param['images']
        extract_path = exp_param['extract_path']
        classes_keys = exp_param['classes'].keys()

        if node_name == "bag":
            train_test_list = exp_param['train_test_list']

            images, extract_time = extract_bag.main(images, train_test_list,
                                                    extract_path,
                                                    experiment_folder,
                                                    parameters, node_id)

        elif node_name == "bovg":
            train_test_list = exp_param['train_test_list']

            images, extract_time = extract_bovg.main(images, train_test_list,
                                                     extract_path,
                                                     experiment_folder,
                                                     parameters, node_id)

        else:
            images, extract_time = extract_features.main(
                images, classes_keys, extract_path, node_name, parameters,
                node_id)

        execution_time += extract_time

        exp_param['images'] = images
        exp_param['descriptor'] = node_name

    elif node.tag == "normalizer":
        try:
            manager = Manager()
            images = manager.dict(exp_param['images'])
            train_test_list = exp_param['train_test_list']
        except:
            print "\n\tMissing Input. Exiting."
            sys.exit(1)

        norm_fv_paths, normalize_time = normalize_features.main(
            images, train_test_list, experiment_folder, node_name, parameters,
            node_id)
        execution_time += normalize_time

        del exp_param['images']
        exp_param['fv_paths'] = norm_fv_paths

    elif node.tag == "classifier":
        try:
            classes = exp_param['classes']
            train_test_list = exp_param['train_test_list']
            descriptor = exp_param['descriptor']
            try:
                fv_paths = exp_param['fv_paths']
                del exp_param['fv_paths']
            except:
                images = exp_param['images']
                fv_paths = util.save_file_extract(images, train_test_list,
                                                  experiment_folder)
        except:
            print "\n\tMissing Input. Exiting."
            sys.exit(1)

        images, classes_list, classify_time = classify.main(
            fv_paths, classes.keys(), train_test_list, experiment_folder,
            node_name, parameters, descriptor, node_id)
        execution_time += classify_time

        exp_param['images'] = images
        exp_param['classes_list'] = classes_list

    elif node.tag == "fusion_method":
        len_exp_param = len(exp_param)
        #list with the images dictionaries, classes dictionaries, and train and
        # test set list
        list_images = []
        list_classes = []
        list_train_test = []
        extract_path = exp_param[INDEX_ZERO]['extract_path']

        for index in range(len_exp_param):
            try:
                list_images.append(exp_param[index]['images'])
            except:
                images = {}
                for fv_path in exp_param[index]['fv_paths']:
                    print "fv_path:", fv_path
                    images_new = util.read_fv_file(fv_path)
                    images = util.merge_dict(images, images_new)
                list_images.append(images)

            list_classes.append(exp_param[index]['classes'])
            #In case that it performs the fusion of collections, there is no
            # train_test_list
            try:
                list_train_test.append(exp_param[index]['train_test_list'])
            except:
                list_train_test.append(None)
        #classes_list is present only after the classification module
        try:
            classes_list = exp_param[INDEX_ZERO]['classes_list']
        except:
            classes_list = None
        try:
            train_test_method = exp_param[INDEX_ZERO]['train_test_method']
            train_test_parameters = exp_param[INDEX_ZERO][
                'train_test_parameters']
        except:
            train_test_method = None
            train_test_parameters = None

        images, classes, train_test_list, fusion_time = \
                fusion.main(list_images, list_classes, list_train_test,
                        classes_list, experiment_folder, node_name, parameters,
                        node_id)
        execution_time += fusion_time

        exp_param = {}
        exp_param['images'] = images
        exp_param['classes'] = classes
        if train_test_list is not None:
            exp_param['train_test_list'] = train_test_list
        if classes_list is not None:
            exp_param['classes_list'] = classes_list
        if train_test_method is not None:
            exp_param['train_test_method'] = train_test_method
            exp_param['train_test_parameters'] = train_test_parameters
        exp_param['descriptor'] = None
        exp_param['extract_path'] = extract_path

    elif node.tag == "evaluation_measure":
        try:
            images = exp_param['images']
            train_test_list = exp_param['train_test_list']
            classes_list = exp_param['classes_list']
        except:
            print "\n\tMissing Input. Exiting."
            sys.exit(1)

        evaluation_time, evaluation_path = evaluation.main(
            images, train_test_list, classes_list, experiment_folder,
            node_name, parameters, node_id)
        execution_time += evaluation_time

        #Dictionaries to create the tex file
        train_test_method = exp_param['train_test_method']
        train_test_parameters = str(exp_param['train_test_parameters'])

        if train_test_method not in tex_dict:
            tex_dict[train_test_method] = {}
        train_test_dict = tex_dict[train_test_method]

        if train_test_parameters not in train_test_dict:
            train_test_dict[train_test_parameters] = {}
        output_dict = train_test_dict[train_test_parameters]

        if node_name not in output_dict:
            output_dict[node_name] = []
        list_output = [evaluation_path, classes_list[0], node_id]
        if list_output not in output_dict[node_name]:
            output_dict[node_name].append(list_output)

        train_test_dict[train_test_parameters] = output_dict
        tex_dict[train_test_method] = train_test_dict

    elif node.tag == "preprocessing":
        images = exp_param['images']
        classes = exp_param['classes']

        images, classes, preprocessing_time = preprocessing.main(
            images, classes, experiment_folder, node_name, parameters, node_id)
        execution_time += preprocessing_time

        exp_param['images'] = images
        exp_param['classes'] = classes

    else:
        print "Error. Unknown Tag."
        sys.exit(1)

    return exp_param
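
# A minimal, hypothetical driver sketch (the run_experiment name, the
# 'experiment.xml' layout, and the linear document-order traversal are
# assumptions, not part of the original snippet): each node of the experiment
# graph is executed in turn, threading the running-time variables forward.
# The module-level globals used by execute() (execution_time, tex_path,
# tex_dict, openset_experiment) are assumed to be initialised elsewhere.
import xml.etree.ElementTree as ET

def run_experiment(xml_path, experiment_folder):
    root = ET.parse(xml_path).getroot()
    exp_param = {}
    for node in root:
        # execute() returns the updated running-time variables for the next node
        exp_param = execute(node, exp_param, experiment_folder)
    return exp_param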
Example #10
                         str(categories_to_include - genre_to_files.keys()))

    #https://stackoverflow.com/a/13738951/7102572
    corpus_dir = 'grc'
    if not os.path.isdir(corpus_dir):
        try:
            cmd = 'svn'
            proc = subprocess.run([
                cmd, 'export',
                'https://github.com/timgianitsos/tesserae/trunk/texts/grc'
            ])
            proc.check_returncode()
        except OSError as e:
            print(f'Your system may not have "{cmd}" installed')
            raise e
        except subprocess.CalledProcessError as e:
            raise e

    #Feature extractions
    extract_features.main(
        corpus_dir,
        'tess',

        #Exclude all files of genres not specified. Exclude composite files no matter what
        excluded_paths=composite_files |
        (set() if len(sys.argv) <= 2 else reduce(
            lambda cur_set, next_set: cur_set | next_set,
            (genre_to_files[tok] for tok in genre_to_files
             if tok not in categories_to_include), set())),
        output_file=None if len(sys.argv) <= 1 else sys.argv[1])
Example #11
import greek_features  #seemingly unused here, but this makes the environment recognize features
import extract_features
from corpus_categories import composite_files, verse_misc_files, prose_files
import os
import sys

if __name__ == '__main__':

    #Download corpus if non-existent
    corpus_dir = os.path.join('tesserae', 'texts', 'grc')
    tesserae_clone_command = 'git clone https://github.com/timgianitsos/tesserae.git'
    if not os.path.isdir(corpus_dir):
        print(RED + 'Corpus at ' + corpus_dir +
              ' does not exist - attempting to clone repository...' + RESET)
        if os.system(tesserae_clone_command) != 0:
            raise Exception('Unable to obtain corpus for feature extraction')

    #Feature extractions
    extract_features.main(
        corpus_dir,
        'tess',

        #Exclude the following directories and files
        excluded_paths=composite_files | verse_misc_files | prose_files,

        #Output the results to a file to be processed by machine learning algorithms
        output_file=None if len(sys.argv) <= 1 else sys.argv[1])
Example #12
OUTPUT_DIR = 'gs://{}/bert/models/{}'.format(BUCKET, TASK)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


# Download the data. 
! wget https://competitions.codalab.org/my/datasets/download/69a3e8d0-b836-48b8-8795-36a6865a1c04
! unzip 69a3e8d0-b836-48b8-8795-36a6865a1c04
! rm 69a3e8d0-b836-48b8-8795-36a6865a1c04
! mv data.tsv data/data.tsv 
! mv eval1_unlabelled.tsv data/testdata.tsv



# Create a smaller dummy dataset from the first 10,000 rows of the full data
datafile_name = "data/data.tsv"
smalldb = pd.read_csv(datafile_name, sep='\t', nrows=10000)
trainingSet, testSet = train_test_split(smalldb, test_size=0.1)
trainingSet.to_csv("data/traindata.tsv", sep='\t')
testSet.to_csv("data/validationdata.tsv", sep='\t')


## using tokenization 
!rm -rf msaic
!git clone https://vaibhavgeek:[email protected]/vaibhavgeek/msaic.git

import sys 
sys.path += ["msaic/bert-repo"]
import extract_features
extract_features.main("data/traindata.tsv" , "bert-msaic")
#mstringApp = '/Applications/MATLAB_R2016a.app/bin/matlab'
xmlName = '/home/cosc/csprh/linux/HABCODE/code/HAB/extractData/configHABunderDesk.xml'

mstringApp = 'matlab'

tree = et.parse(xmlName)
tree.find('.//testDate').text = sample_date_string
imsDir = tree.find('.//testImsDir').text
tree.write(xmlName)

imsDir = os.path.join(imsDir, sample_date_string)
modelD = os.getcwd()

os.chdir('../extractData')
# GENERATE DATACUBES FOR A BUNCH OF LAT, LON POSITIONS IN A GRID
mstring = mstringApp + ' -nosplash -r \"test_genAllH5s; quit;\"'
os.system(mstring)
os.chdir('postProcess')
# GENERATE IMAGES FROM DATA CUBES
# GENERATED LAT AND LONS latLonList.txt TEXT FILE IN imsDir
mstring = mstringApp + ' -nosplash -r \"test_cubeSequence; quit;\"'
os.system(mstring)

os.chdir(modelD)
# EXTRACT BOTTLENECK FEATURES FROM IMAGES
extract_features.main(['cnfgXMLs/NASNet11_lstm0.xml', imsDir])

# GENERATE CLASSIFICATION FROM BOTTLENECK FEATURES AND TRAINED MODEL
# GENERATED CLASSIFICATIONS ENTERED INTO classesProbs.txt TEXT FILE IN imsDir
testHAB.main(['cnfgXMLs/NASNet11_lstm0.xml', imsDir])
import tensorflow as tf
import numpy as np
import extract_features as ef

X_train,Y_train,X_test,Y_test=ef.main()
print("Feature Extraction Completed!")

nx=X_train.shape[0]
ny=Y_train.shape[0]

m_train=X_train.shape[1]
m_test=X_test.shape[1]

net_arch=[nx, 15, 10, ny]
num_layers=len(net_arch)-1

def init_params():
    parameters={}
    for i in range(num_layers):
        parameters['W'+str(i+1)]=tf.get_variable('W'+str(i+1), shape=[net_arch[i+1],net_arch[i]], initializer=tf.contrib.layers.xavier_initializer(seed = 1))
        parameters['b'+str(i+1)]=tf.get_variable('b'+str(i+1), shape=[net_arch[i+1],1], initializer=tf.contrib.layers.xavier_initializer(seed = 1))
    return parameters

def create_network(A, parameters):
    for i in range(num_layers):
        W=parameters['W'+str(i+1)]
        b=parameters['b'+str(i+1)]
        if(i<num_layers-1): A=tf.nn.relu(tf.matmul(W,A)+b)
        else: A=tf.matmul(W,A)+b
    return A
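
# A minimal, hypothetical training sketch (the softmax cross-entropy loss, Adam
# optimizer, epoch count, and the assumption that Y_train is one-hot encoded are
# not part of the original snippet): it shows how init_params() and
# create_network() could be wired into a TF1-style graph.
X = tf.placeholder(tf.float32, shape=[nx, None], name='X')
Y = tf.placeholder(tf.float32, shape=[ny, None], name='Y')
parameters = init_params()
logits = create_network(X, parameters)
# Transpose so that examples run along the first axis, as the loss expects
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=tf.transpose(Y), logits=tf.transpose(logits)))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(100):
        _, epoch_cost = sess.run([optimizer, cost], feed_dict={X: X_train, Y: Y_train})
    print("Final training cost:", epoch_cost)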
Example #15
        #    print(tensors)
        embeddings, labels, seq_len = db.find_entities(sents)
        print(labels)
        seqwrite = open('sequencesave.txt', mode='w')
        seqwrite.write(str(seq_len))
        seqwrite.close()
        #    print(embeddings)

        #   program.train_model(embeddings, labels, seq_len)

        program.train_bidirectional(embeddings, labels, seq_len)

    elif choice == '1':
        model = models.load_model("SparseEntity.model")
        sentence = input('Input: ')
        sentence = sentence.replace("\n", "")
        writef = open('predictionfile.txt', mode='w')
        writef.write(sentence)
        writef.close()
        infile = open('predictionfile.txt').read()
        print(infile)
        ef.main('predictionfile.txt')
        tensor, sl = ef.build_tensors()
        #        print(tensor)
        prediction = program.predict(model, tensor)
        print(prediction)

        prediction = prediction[0].tolist()
        print(max(prediction))
        print(prediction.index(max(prediction)))