Example #1
def index_page(html_data):
    """ Indexes a single HTML page. """

    soup = BeautifulSoup(html_data, features='html.parser')

    # Remove scripts and styles
    for script in soup(["script", "style"]):
        script.extract()

    # Get only text from page and preprocess it
    text = soup.get_text(separator=' ')
    tokens = preprocessing.preprocess(text)
    raw_tokens = preprocessing.preprocess(text, keep_stop_words=True)

    # Unique words
    word_list = set(tokens)

    # Get frequencies and indices
    frequencies = {word: 0 for word in word_list}
    indexes = {word: [] for word in word_list}
    for i, word in enumerate(raw_tokens):
        if word in word_list:
            frequencies[word] += 1
            indexes[word].append(i)

    return word_list, frequencies, indexes
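
A minimal usage sketch (not part of the original example): the HTML file path and query term are illustrative, and the preprocessing module used by index_page is assumed to be importable.

from pathlib import Path

html_data = Path('data/example.html').read_text(encoding='utf-8')
word_list, frequencies, indexes = index_page(html_data)

term = 'search'  # hypothetical query term
if term in word_list:
    print(f"'{term}' occurs {frequencies[term]} times at positions {indexes[term][:5]}")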
Example #2
def process_results(results: list, html_path):
    out = []
    for res in results:
        pos = res[1]
        document = res[2]
        filepath = html_path / document

        with filepath.open() as file:
            html_data = file.read()

        soup = BeautifulSoup(html_data, features='html.parser')

        # Remove scripts and styles
        for script in soup(["script", "style"]):
            script.extract()

        # Get only text from page and preprocess it
        text = soup.get_text(separator=' ')
        tokens = preprocessing.preprocess(text, raw=True, keep_stop_words=True)
        # print(len(tokens), len(word_tokenize(text)))

        document_length = len(tokens)

        # Take up to 3 tokens of context on either side of each of the first 100
        # match positions, then deduplicate, sort, and merge them into runs.
        context_positions = sorted(set(chain.from_iterable(
            range(max(0, x - 3), min(x + 4, document_length)) for x in pos[:100])))
        outtake = list(group_runs(context_positions))

        snippet = make_snippet(tokens, outtake)

        # print(pos)
        # print(outtake)
        # print(snippet[:250])
        out.append((res[0], res[2], snippet[:250]))

    return out
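
A hedged sketch of how process_results might be invoked; the tuple layout (score, match positions, document filename) is inferred from the indexing above, and the file name is made up.

from pathlib import Path

fake_results = [(0.87, [12, 13, 45], 'page_0001.html')]  # illustrative only
for score, document, snippet in process_results(fake_results, Path('data')):
    print(score, document, snippet)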
Example #3
def option_percent(window_size, training_file, training_labels, num_examples,
                   percent):
    #get the generator for features and labels
    generator = preprocessing.preprocess(training_file, training_labels,
                                         window_size)

    features = []
    labels = []
    for _ in range(num_examples):
        curr = next(generator)
        #need to convert all to int
        curr_features = curr[0]
        curr_features = list(map(int, curr_features))
        features.append(curr_features)
        labels.append(int(curr[1]))

    #need lists as numpy arrays to feed into tensor
    features = np.asarray(features)
    labels = np.asarray(labels)

    #partition data into training and testing sets
    X_train, X_test, Y_train, Y_test = sk.train_test_split(features,
                                                           labels,
                                                           test_size=percent,
                                                           random_state=42)
Example #4
    def __init__(self, tweet):
        self.tweet = tweet
        decodedText = self.tweet.text.encode('ascii', 'ignore').decode('utf-8')

        # Calculate sentiment
        self.processedText = preprocess(decodedText)
        self.sentiment = getSentiment(self.processedText)
Example #5
    def map_sentiment_value(post):
        if 'caption' in post:
            caption = post['caption']
            preprocessed_text = preprocess(caption)
            result = getSentiment(preprocessed_text)
            post['sentiment'] = result
            post['sentiment_compound'] = result['compound']
        else:
            post['sentiment'] = ""
            post['sentiment_compound'] = 0

        return post
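
A hedged illustration of applying the mapper above to a batch of posts; the sample posts are made up, and map_sentiment_value is assumed to be callable directly (in the original it is defined inside an enclosing scope).

posts = [
    {'caption': 'What a beautiful day at the beach!'},
    {'id': 42},  # no caption, so it falls back to neutral defaults
]
scored = [map_sentiment_value(post) for post in posts]
for post in scored:
    print(post.get('caption', '(no caption)'), post['sentiment_compound'])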
Example #6
def main(raw_query, limit):
    results_considered = int(limit)
    start_time = time.time()

    # preprocess the query
    query = preprocessing.preprocess(raw_query)

    results = sqlite_search(query)

    time_elapsed = time.time() - start_time

    to_process = results[:min(results_considered, len(results))]

    html_path = Path(__file__).parent / 'data'
    output = process_results(to_process, html_path)

    return output_string(raw_query, time_elapsed, output)
Example #7
def main(raw_query, limit):
    dirname = Path(__file__).parent
    data_dir = dirname / 'data'

    results_considered = int(limit)
    start_time = time.time()

    # preprocess the query
    query = preprocessing.preprocess(raw_query)

    results = run_search(data_dir, query)

    time_elapsed = time.time() - start_time

    to_process = results[:min(results_considered, len(results))]

    output = process_results(to_process, data_dir)

    return output_string(raw_query, time_elapsed, output)
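
A hypothetical invocation of the search entry point above; the query string and result limit are made up for illustration.

if __name__ == '__main__':
    print(main('vector space model', 10))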
Example #8
def option_save(window_size, training_file, training_labels, num_examples,
                dest_file):
    #get the generator for features and labels
    generator = preprocessing.preprocess(training_file, training_labels,
                                         window_size)

    features = []
    labels = []
    for _ in range(num_examples):
        curr = next(generator)
        #need to convert all to int
        curr_features = curr[0]
        curr_features = list(map(int, curr_features))
        features.append(curr_features)
        labels.append(int(curr[1]))

    #need lists as numpy arrays to feed into tensor
    features = np.asarray(features)
    labels = np.asarray(labels)

    #train a model and save it to dest_file
    train.trainsave(num_examples, training_file, training_labels, dest_file)

    return (0)
Example #9
def evaluate_mp(window_size, input_file, label_file, num_examples, in_file):
    data_size = window_size*window_size
    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)])  #inputs 
    y_ = tf.placeholder("float", [None, CLASSES])   #ground-truth labels


    #make sure that topology setup will work
    layer_1_nodes = data_size
    layer_2_nodes = data_size
    assert data_size % LAYER_1_SUBGRAPHS == 0
    assert layer_1_nodes % LAYER_1_SUBGRAPHS == 0
    assert layer_2_nodes % LAYER_2_SUBGRAPHS == 0
    assert CLASSES % LAYER_2_SUBGRAPHS == 0



    #create variables to store weights and biases
    #h1, h2, b1, and b2 contain lists of variables to be used in the subconnected 
    #   layers
    #h1 and b1 create variables that each correspond to one of the subgraphs of 
    #   layer 1. There should be (LAYER_1_SUBGRAPHS) different variables created
    #   in each. Each variable should be named "h1_[#]" or "b1_[#]", where "#"
    #   is the variable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    #   subconnected layer
    #the out variables control the input into the fully-connected final layer 
    #   and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE
    weights = {
        'h1': [tf.Variable(tf.random_normal([int(data_size/LAYER_1_SUBGRAPHS), int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("h1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'h2': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_2_SUBGRAPHS), int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("h2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(layer_2_nodes), int(CLASSES)]), name= "out_weights")
    }
    biases = {
        'b1': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("b1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'b2': [tf.Variable(tf.random_normal([int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("b2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(CLASSES)]), name="out_biases")
    }

    #add variables to collection and initialize the saver
    for s in range(0, LAYER_1_SUBGRAPHS):
        tf.add_to_collection('vars', ("h1_"+str(s)))
        tf.add_to_collection('vars', ("b1_"+str(s)))
    for s in range(0, LAYER_2_SUBGRAPHS):
        tf.add_to_collection('vars', ("h2_"+str(s)))
        tf.add_to_collection('vars', ("b2_"+str(s)))
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")
    saver = tf.train.Saver()
    
    
    # Construct model
    y = multilayer_perceptron(x, weights, biases)   #y contains the predicted outputs
                                                #which will be compared to the 
                                                #ground-truth, y_

    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    #get generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features)) 
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)    
    features = np.asarray(features)
    labels = np.asarray(labels) 
    
    with tf.Session() as sess:
        #load the data from in_file
        loader = tf.train.import_meta_graph(in_file)
        loader.restore(sess, tf.train.latest_checkpoint('./'))
        
        total_error = sess.run([cost], feed_dict={x:features, y_:labels})[0]
        print("The test error was", (total_error/num_examples))        
Example #10
def train_mp(window_size, input_file, label_file, num_examples, out_file):
    data_size = window_size*window_size
    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)])  #inputs 
    y_ = tf.placeholder("float", [None, CLASSES])   #ground-truth labels


    #make sure that topology setup will work
    layer_1_nodes = data_size
    layer_2_nodes = data_size
    assert data_size % LAYER_1_SUBGRAPHS == 0
    assert layer_1_nodes % LAYER_1_SUBGRAPHS == 0
    assert layer_2_nodes % LAYER_2_SUBGRAPHS == 0
    assert CLASSES % LAYER_2_SUBGRAPHS == 0



    #create variables to store weights and biases
    #h1, h2, b1, and b2 contain lists of variables to be used in the subconnected 
    #   layers
    #h1 and b1 create variables that each correspond to one of the subgraphs of 
    #   layer 1. There should be (LAYER_1_SUBGRAPHS) different variables created
    #   in each. Each variable should be named "h1_[#]" or "b1_[#]", where "#"
    #   is the variable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    #   subconnected layer
    #the out variables control the input into the fully-connected final layer 
    #   and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE
    weights = {
        'h1': [tf.Variable(tf.random_normal([int(data_size/LAYER_1_SUBGRAPHS), int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("h1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'h2': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_2_SUBGRAPHS), int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("h2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(layer_2_nodes), int(CLASSES)]), name= "out_weights")
    }
    biases = {
        'b1': [tf.Variable(tf.random_normal([int(layer_1_nodes/LAYER_1_SUBGRAPHS)]), name=("b1_"+str(s))) for s in range(0, LAYER_1_SUBGRAPHS)],
        'b2': [tf.Variable(tf.random_normal([int(layer_2_nodes/LAYER_2_SUBGRAPHS)]), name=("b2_"+str(s))) for s in range(0, LAYER_2_SUBGRAPHS)],
        'out': tf.Variable(tf.random_normal([int(CLASSES)]), name="out_biases")
    }

    #add variables to collection and initialize the saver
    for s in range(0, LAYER_1_SUBGRAPHS):
        tf.add_to_collection('vars', ("h1_"+str(s)))
        tf.add_to_collection('vars', ("b1_"+str(s)))
    for s in range(0, LAYER_2_SUBGRAPHS):
        tf.add_to_collection('vars', ("h2_"+str(s)))
        tf.add_to_collection('vars', ("b2_"+str(s)))
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")
    saver = tf.train.Saver()
    
    
    # Construct model
    y = multilayer_perceptron(x, weights, biases)   #y contains the predicted outputs
                                                #which will be compared to the 
                                                #ground-truth, y_

    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    #get the generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features)) 
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)    
    features = np.asarray(features)
    labels = np.asarray(labels) 
        
    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
         
        # Training cycle
        for epoch in range(ITERATIONS):
            '''avg_cost = 0.''' #removed from example code to simplify
            total_batch = int(num_examples/BATCH_SIZE)
            # Loop over all batches
            for i in range(total_batch):
                # Run optimization op (backprop) and cost op (to get loss value)
                sess.run([optimizer, cost], feed_dict={x: features, y_: labels})
                
                #removed avg_cost tracking for simplicity
                '''# Compute average loss
                avg_cost += int(c / total_batch)''' #c was collected from sess.run
                
            #removed this section from the example code for simplicity    
            '''# Display logs per epoch step
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=", \
                    "{:.9f}".format(avg_cost))''' 
                
        print("Optimization Finished!")
    
        #print training accuracy
        curr_loss = sess.run([cost], feed_dict={x:features, y_:labels})[0]
        print("The training error was", (curr_loss/num_examples))
        
        #output to out_file
        saver.save(sess, out_file)    
Example #11
# -----------------------------------------------------------------------------
#
# Utils
#
# -----------------------------------------------------------------------------
# @app.route('/uploaded/<filename>')
# def uploaded_file(filename):
# 	return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
# -----------------------------------------------------------------------------
#
# Main
#
# -----------------------------------------------------------------------------

if __name__ == '__main__':
	import preprocessing.preprocessing as preprocessing
	import sys
	if len(sys.argv) > 1 and sys.argv[1] == "collectstatic":
		preprocessing._collect_static(app)
		if 'USE_S3' in app.config:
			flask_s3.create_all(app)
	else:
		# render ccss, coffeescript and shpaml in 'templates' and 'static' dirs
		preprocessing.preprocess(app, request) 
		# set FileSystemCache instead of Memcache for development
		# cache = werkzeug.contrib.cache.FileSystemCache(os.path.join(app.root_path, "cache"))
		# run application
		app.run()
# EOF
Example #12
 def test_parsed_header(self):
     expected_columns = ['_id_fact',
                          '_index_fact',
                          '_score_fact',
                          '_type_fact',
                          'cprojectID',
                          'documentID',
                          'identifiers',
                          'post',
                          'prefix',
                          'term',
                          '_id_meta',
                          '_index_meta',
                          '_score_meta',
                          '_type_meta',
                          'abstractText',
                          'affiliation',
                          'authorIdList',
                          'authorList',
                          'authorString',
                          'chemicalList',
                          'citedByCount',
                          'commentCorrectionList',
                          'dateOfCompletion',
                          'dateOfCreation',
                          'dateOfRevision',
                          'dbCrossReferenceList',
                          'doi',
                          'electronicPublicationDate',
                          'embargoDate',
                          'epmcAuthMan',
                          'firstPublicationDate',
                          'fullTextUrlList',
                          'grantsList',
                          'hasBook',
                          'hasDbCrossReferences',
                          'hasLabsLinks',
                          'hasPDF',
                          'hasReferences',
                          'hasSuppl',
                          'hasTMAccessionNumbers',
                          'hasTextMinedTerms',
                          'id',
                          'inEPMC',
                          'inPMC',
                          'investigatorList',
                          'isOpenAccess',
                          'journalInfo',
                          'keywordList',
                          'language',
                          'license',
                          'luceneScore',
                          'meshHeadingList',
                          'pageInfo',
                          'pmcid',
                          'pmid',
                          'pubModel',
                          'pubTypeList',
                          'pubYear',
                          'source',
                          'subsetList',
                          'title',
                          'tmAccessionTypeList',
                          'sourcedict']
     testdf = preprocessing.preprocess(rawfactspath, rawmetadatapath)
     columns = list(testdf.columns.values)
     self.assertCountEqual(columns, expected_columns, "parsed columns unequal")
Example #13
"""
    #individual_list = list that contains all gene(individual) expression values
    #individual_id_list = contains all UNIQUE GENE ID that is used for calculating BHI
    #individual_ref_id_list = contains all GENE reference id that is used for calculating PPI Interaction Score
    #all_datamatrix, all_normalized_data_matrix = preprocessed the gene expression values that is fed to FCM
    #no_of_annotated_cluster: Number of annotated cluster that we get from DAVID tool
    #annotated_cluster_list: Basically it is a list of list that contains all gene id which are belongs to annotated cluster
    #annotated_gene_list: Set of all annotated gene id
    #annotated_gene_cooccurance_matrix(n X n where n = no of total gene(individual)) : represent if any two gene are
                                                                                    belong to same annotated cluster or not
    #unique_protein_ref: Contains unique Gene refernce ID
    #interaction_score_matrix(n X n where n = number of unique protein reference): contains the interaction score of any
                                                                interaction of two proteins
"""
individual_list = []
individual_length, individual_list, individual_id_list, individual_ref_id_list = preprocess(
    '<path_to_input_dataset>')
No_of_genes = len(individual_list)
all_data_matrix, all_normalized_data_matrix = preprocess_fcm_datamatrix(
    individual_length, individual_list)
gene_id_list = tuple(open('Intermediate_Data/population_id.txt', 'r'))
no_of_annotated_cluster, annotated_cluster_list, annotated_gene_list, annotated_gene_cooccurance_matrix = AnnotatedClustering(
    individual_id_list)
unique_protein_ref, interaction_score_matrix = Confidence_Score_Matrix()
"""
Input from user
"""
chromosome_number = int(
    sys.argv[1]
)  # Enter the number of chromosomes (individuals) you want to generate
generation_number = int(sys.argv[2])  # Enter the maximum number of generations
Example #14
    def transform(self, X, y=None, **fit_params):
        return [preprocessing.preprocess(d) for d in X]
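
A minimal sketch, not part of the original snippet, of the kind of scikit-learn transformer such a transform method usually belongs to; the class name is hypothetical and the preprocessing module is assumed to be importable.

from sklearn.base import BaseEstimator, TransformerMixin

class PreprocessTransformer(BaseEstimator, TransformerMixin):
    """Applies preprocessing.preprocess to every document in a batch."""

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X, y=None, **fit_params):
        return [preprocessing.preprocess(d) for d in X]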
Example #15

def print_generation(population, generation_num):
    print("Generation:- {}".format(generation_num))


"""
    #individual_list = list that contains all gene(individual) expression values
    #individual_id_list = contains all UNIQUE GENE ID that is used for calculating BHI
    #individual_ref_id_list = contains all GENE reference id that is used for calculating PPI Interaction Score
    #all_datamatrix, all_normalized_data_matrix = preprocessed the gene expression values that is fed to FCM
    #unique_protein_ref: Contains unique Gene refernce ID
"""
individual_list = []

individual_length, individual_list, individual_id_list, individual_ref_id_list = preprocess(
    'Input_data/preprocessed_ILD.txt')
all_data_matrix, all_normalized_data_matrix = preprocess_fcm_datamatrix(
    individual_length, individual_list)
"""
Input from user
"""
chromosome_number = int(
    sys.argv[1]
)  # Enter the number of chromosomes (individuals) you want to generate
generation_number = int(sys.argv[2])  # Enter the maximum number of generations

zdt_definitions = ZDT3Definitions(individual_list, individual_id_list,
                                  individual_ref_id_list)
plotter = Plotter(zdt_definitions, individual_list)
problem = ZDT(zdt_definitions, all_normalized_data_matrix, chromosome_number)
evolution = Evolution(problem, generation_number, chromosome_number,
Example #16
# Try to load cached, preprocessed datasets first
try:
    X_train = pd.read_csv(f"{CACHE_FOLDER}/X_train_preprocessed.csv",
                          index_col="Unnamed: 0")
    Y_train = pd.read_csv(f"{CACHE_FOLDER}/Y_train_preprocessed.csv",
                          index_col="Unnamed: 0")

# If the dataset is not found
except FileNotFoundError:
    print("\t-> File not found, generating preprocessed datasets")
    # Load normal datasets
    X_train = pd.read_csv(f"{DATASET_FOLDER}/X_train_update.csv",
                          index_col="Unnamed: 0")
    Y_train = pd.read_csv(f"{DATASET_FOLDER}/Y_train_CVw08PX.csv",
                          index_col="Unnamed: 0")

    # preprocess datasets
    X_train, Y_train = preprocess(X_train, Y_train)

    # save preprocessed datasets
    X_train.to_csv(f"{CACHE_FOLDER}/X_train_preprocessed.csv")
    Y_train.to_csv(f"{CACHE_FOLDER}/Y_train_preprocessed.csv")

print("\t-> Done\n")

#############################
# STEP 2: sentences embedding
#############################

print("STEP 2: Preparing data for training...")

train_x, valid_x, train_y, valid_y = get_datasets_for_training(
    X_train['designation'], Y_train['prdtypecode'], "tfidf")
Example #17
def test_mp(window_size, input_file, label_file, num_examples, in_file, layers,
            nodes, subgraphs, classes, iterations, batch_size, training_rate):

    #cast variables to correct types
    window_size = int(window_size)
    num_examples = int(num_examples)
    layers = int(layers)
    #get rid of any spaces in nodes and subgraphs so they cast correctly
    nodes = literal_eval(str(nodes).replace(' ', ''))
    subgraphs = literal_eval(str(subgraphs).replace(' ', ''))
    classes = int(classes)
    iterations = int(iterations)
    batch_size = int(batch_size)
    training_rate = float(training_rate)

    #make sure length of lists is correct
    assert (layers == len(nodes))
    assert (layers == len(subgraphs))

    #define nodes[0] as the data_size and subgraphs[0] as 1
    data_size = window_size * window_size
    nodes = [data_size] + nodes
    subgraphs = [1] + subgraphs

    #make sure that topology setup will work
    #check up to layers-1, the highest index
    for i in range(1, layers):
        assert (nodes[i - 1] % subgraphs[i] == 0)
        assert (nodes[i] % subgraphs[i] == 0)
    assert (classes % subgraphs[layers] == 0)

    data_size = window_size * window_size
    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)])  #inputs
    y_ = tf.placeholder("float", [None, classes])  #ground-truth labels

    #create variables to store weights and biases
    #create an h in weights and a b in biases for each layer in the model
    #h1 and b1 create variables that each correspond to one of the subgraphs of
    #   layer 1. There should be (subgraphs[1]) different subvariables created
    #   in each. Each subvariable should be named "h1_[#]" or "b1_[#]", where "#"
    #   is the subvariable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    #   subconnected layer, as are h3 and b3 for the third and so on
    #the out variables control the input into the fully-connected final layer
    #   and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE

    #start by initializing weights and biases with the out variables
    weights = {
        'out':
        tf.Variable(tf.random_normal([int(nodes[layers]),
                                      int(classes)]),
                    name="out_weights")
    }
    biases = {
        'out': tf.Variable(tf.random_normal([int(classes)]), name="out_biases")
    }

    #add in the h and b variables for each hidden layer
    #note: you are creating subgraphs[i] subvariables in both weights and biases;
    #each weight subvariable is a (nodes[i-1]/subgraphs[i]) x (nodes[i]/subgraphs[i]) matrix and each
    #bias subvariable a vector of length nodes[i]/subgraphs[i], storing the connections for that subgraph.
    #the s in range(0, subgraphs[i]) is creating multiple subvariables inside of each
    #weights[weights_name] or biases[biases_name]
    #for documentation on creating each of these subvariables, see
    #   https://www.tensorflow.org/api_docs/python/tf/random_normal
    for i in range(1, layers + 1):
        weights_name = "h" + str(i)
        biases_name = "b" + str(i)
        weights[weights_name] = [
            tf.Variable(tf.random_normal([
                int((nodes[i - 1]) / subgraphs[i]),
                int(nodes[i] / subgraphs[i])
            ]),
                        name=(weights_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]
        biases[biases_name] = [
            tf.Variable(tf.random_normal([int((nodes[i]) / subgraphs[i])]),
                        name=(biases_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]

    #add variables to collection and initialize the saver
    #for each layer, add all of the subvariables
    for i in range(1, layers + 1):
        weights_name = "h" + str(i) + "_"
        biases_name = "b" + str(i) + "_"
        for s in range(subgraphs[i]):
            subweight_name = weights_name + str(s)  #each should be "h(i)_(s)"
            subbias_name = biases_name + str(s)  #each should be "b(i)_(s)"
            tf.add_to_collection('vars', subweight_name)
            tf.add_to_collection('vars', subbias_name)
    #add the out variables
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")
    #initialize saver
    saver = tf.train.Saver()

    # Construct model
    y = multilayer_perceptron(x, layers, weights, biases,
                              subgraphs)  #y contains the predicted outputs
    #which will be compared to the
    #ground-truth, y_

    # Define loss and optimizer
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=training_rate).minimize(cost)

    #get the generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features))
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)
    features = np.asarray(features)
    labels = np.asarray(labels)

    # Launch the graph
    with tf.Session() as sess:
        #load the data from in_file
        #NOTE: IF RUNNING ON A DIFFERENT MACHINE THAN IT WAS TRAINED ON, ADD
        #"clear_devices=true" into the arguments for import_meta_graph
        loader = tf.train.import_meta_graph(in_file)
        loader.restore(sess, tf.train.latest_checkpoint('./'))

        total_error = sess.run([cost], feed_dict={x: features, y_: labels})[0]
        print("The test error was", (total_error / num_examples))
Example #18
def process(dataset_name, out_folder, train_size, one_document_per_folder, force, args, concat_train_instances, shuffle, preprocess):
    out_folder = os.path.join(out_folder, dataset_name)

    if not force and os.path.isdir(out_folder):
        print('Outfolder existing! Aborting ({})'.format(out_folder))
        sys.exit(1)

    X, Y = dataset_helper.get_dataset(dataset_name)

    print('#Docs: {}'.format(len(X)))

    if preprocess:
        X = [preprocessing.preprocess(x) for x in X]

    if shuffle:
        data_train_X, data_test_X, data_train_Y, data_test_Y = dataset_helper.split_dataset(X, Y, train_size=train_size)
    else:
        data_train_X, data_test_X, data_train_Y, data_test_Y = X, [], Y, []

    if train_size == 1.0:
        sets = [
            ('all', data_train_X, data_train_Y)
        ]
    else:
        sets = [
            ('train', data_train_X, data_train_Y),
            ('test', data_test_X, data_test_Y)
        ]

    # Create folder
    os.makedirs(out_folder, exist_ok=True)
    all_topic_counts = defaultdict(int)
    for set_name, X, Y in sets:
        topic_id_counters = defaultdict(int)
        set_folder = os.path.join(out_folder, set_name)
        assert len(X) == len(Y)

        for x, y in zip(X, Y):
            # Create set folder if not one_document_per_folder
            if one_document_per_folder:
                folder = set_folder
            else:
                folder = os.path.join(set_folder, str(y))
            os.makedirs(folder, exist_ok=True)

            doc_id = str(topic_id_counters[y]).zfill(4)

            if concat_train_instances and set_name == 'train':
                filename = '{}/{}/{}.txt'.format(folder, y, doc_id)
            elif one_document_per_folder:
                filename = '{}/{}_{}/{}.txt'.format(folder, y, doc_id, '0')
            os.makedirs(os.path.join(*filename.split('/')[:-1]), exist_ok=True)

            with codecs.open(filename, 'w') as f:
                f.write(x)
            all_topic_counts[y] += 1
            topic_id_counters[y] += 1

    with open(os.path.join(out_folder, 'stats.json'), 'w') as f:
        json.dump({
            'total_docs': sum(all_topic_counts.values()),
            'categories': list(set(Y)),
            'topic_counts': all_topic_counts,
            'set_counts': {name: len(X) for name, X, Y in sets},
            'params': args,
            'timestamp': time_utils.get_time_formatted(),
            'unix_timestamp': time_utils.get_timestamp(),
            'git_commit': str(git_utils.get_current_commit())
        }, f, indent=4, sort_keys=True)
Example #19
# -----------------------------------------------------------------------------
#
# Utils
#
# -----------------------------------------------------------------------------
# @app.route('/uploaded/<filename>')
# def uploaded_file(filename):
# 	return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
# -----------------------------------------------------------------------------
#
# Main
#
# -----------------------------------------------------------------------------

if __name__ == '__main__':
    import preprocessing.preprocessing as preprocessing
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == "collectstatic":
        preprocessing._collect_static(app)
        if app.config['USE_S3']:
            flask_s3.create_all(app)
    else:
        # render ccss, coffeescript and shpaml in 'templates' and 'static' dirs
        preprocessing.preprocess(app, request)
        # set FileSystemCache instead of Memcache for development
        # cache = werkzeug.contrib.cache.FileSystemCache(os.path.join(app.root_path, "cache"))
        # run application
        app.run()

# EOF
Example #20
def train_mp(window_size, input_file, label_file, num_examples, out_file,
             layers, nodes, subgraphs, classes, iterations, batch_size,
             training_rate):

    window_size = int(window_size)
    #print("window_Size=", window_size)
    #print("input_file=", input_file)
    #print("label_file", label_file)
    num_examples = int(num_examples)
    #print("num_examples=", num_examples)
    #print("out_file=",out_file)
    layers = int(layers)
    #print("layers=", layers)
    nodes = list(map(int, nodes))
    #print("nodes=", nodes)
    subgraphs = list(map(int, subgraphs))
    #print("subgraphs=", subgraphs)
    classes = int(classes)
    #print("classes=", classes)
    iterations = int(iterations)
    #print("iterations=",iterations)
    batch_size = int(batch_size)
    #print("batch size=",batch_size)
    training_rate = float(training_rate)
    #print("training_rate=",training_rate)

    #make sure length of lists is correct
    assert (layers == len(nodes))
    assert (layers == len(subgraphs))

    #define nodes[0] as the data_size and subgraphs[0] as 1
    data_size = window_size * window_size
    print(data_size)
    nodes = [data_size] + nodes
    subgraphs = [1] + subgraphs

    #make sure that topology setup will work
    #check up to layers-1, the highest index
    for i in range(1, layers):
        assert (nodes[i - 1] % subgraphs[i] == 0)
        assert (nodes[i] % subgraphs[i] == 0)
    assert (classes % subgraphs[layers] == 0)

    #record time for training to run
    start_time = time.time()

    data_size = window_size * window_size
    # tf Graph input
    x = tf.placeholder("float", [None, (data_size)])  #inputs
    y_ = tf.placeholder("float", [None, classes])  #ground-truth labels

    #create variables to store weights and biases
    #create an h in weights and a b in biases for each layer in the model
    #h1 and b1 create variables that each correspond to one of the subgraphs of
    #   layer 1. There should be (subgraphs[1]) different subvariables created
    #   in each. Each subvariable should be named "h1_[#]" or "b1_[#]", where "#"
    #   is the subvariable number
    #h2 and b2 are the same as h1 and b1 except that they apply to the second
    #   subconnected layer, as are h3 and b3 for the third and so on
    #the out variables control the input into the fully-connected final layer
    #   and are named "out_weights" and "out_biases"
    #NOTE: THE NAMES ARE NECESSARY TO SAVE THE MODEL TO A FILE

    #start by initializing weights and biases with the out variables
    weights = {
        'out':
        tf.Variable(tf.random_normal([int(nodes[layers]),
                                      int(classes)]),
                    name="out_weights")
    }
    biases = {
        'out': tf.Variable(tf.random_normal([int(classes)]), name="out_biases")
    }

    #add in the h and b variables for each hidden layer
    #note: you are creating subgraphs[i] subvariables in both weights and biases;
    #each weight subvariable is a (nodes[i-1]/subgraphs[i]) x (nodes[i]/subgraphs[i]) matrix and each
    #bias subvariable a vector of length nodes[i]/subgraphs[i], storing the connections for that subgraph.
    #the s in range(0, subgraphs[i]) is creating multiple subvariables inside of each
    #weights[weights_name] or biases[biases_name]
    #for documentation on creating each of these subvariables, see
    #   https://www.tensorflow.org/api_docs/python/tf/random_normal
    for i in range(1, layers + 1):
        weights_name = "h" + str(i)
        biases_name = "b" + str(i)
        weights[weights_name] = [
            tf.Variable(tf.random_normal([
                int((nodes[i - 1]) / subgraphs[i]),
                int(nodes[i] / subgraphs[i])
            ]),
                        name=(weights_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]
        biases[biases_name] = [
            tf.Variable(tf.random_normal([int((nodes[i]) / subgraphs[i])]),
                        name=(biases_name + "_" + str(s)))
            for s in range(0, subgraphs[i])
        ]

    #add variables to collection and initialize the saver
    #for each layer, add all of the subvariables
    for i in range(1, layers + 1):
        weights_name = "h" + str(i) + "_"
        biases_name = "b" + str(i) + "_"
        for s in range(subgraphs[i]):
            subweight_name = weights_name + str(s)  #each should be "h(i)_(s)"
            subbias_name = biases_name + str(s)  #each should be "b(i)_(s)"
            tf.add_to_collection('vars', subweight_name)
            tf.add_to_collection('vars', subbias_name)
    #add the out variables
    tf.add_to_collection('vars', "out_weights")
    tf.add_to_collection('vars', "out_biases")
    #initialize saver
    saver = tf.train.Saver()

    # Construct model
    y = multilayer_perceptron(x, layers, weights, biases,
                              subgraphs)  #y contains the predicted outputs
    #which will be compared to the
    #ground-truth, y_

    # Define loss and optimizer
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=training_rate).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    #get the generator for features and labels
    generator = preprocessing.preprocess(input_file, label_file, window_size)
    features = []
    labels = []
    for count, curr in enumerate(generator):
        if count >= num_examples:
            break
        curr_features = curr[0]
        curr_features = list(map(float, curr_features))
        curr_labels = curr[1]
        curr_labels = list(map(float, curr_labels))
        features.append(curr_features)
        labels.append(curr_labels)
    features = np.asarray(features)
    labels = np.asarray(labels)

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)

        # Training cycle
        for epoch in range(iterations):
            '''avg_cost = 0.'''  #removed from example code to simplify
            total_batch = int(num_examples / batch_size)
            # Loop over all batches
            for i in range(total_batch):
                # Run optimization op (backprop) and cost op (to get loss value)
                sess.run([optimizer, cost],
                         feed_dict={
                             x: features,
                             y_: labels
                         })

                #removed avg_cost tracking for simplicity
                '''# Compute average loss
                avg_cost += int(c / total_batch)''' #c was collected from sess.run

            #removed this section from the example code for simplicity
            '''# Display logs per epoch step
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=", \
                    "{:.9f}".format(avg_cost))'''

        print("Optimization Finished!")

        end_time = time.time()

        #print training accuracy and training time
        curr_loss = sess.run([cost], feed_dict={x: features, y_: labels})[0]
        print("The training error was", (curr_loss / num_examples))
        print("Optimization took %s seconds" % (end_time - start_time))

        #output to out_file
        saver.save(sess, out_file)
Example #21
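    # Split the flat feature vector into cluster centres of individual_length values each
    # (the enclosing function's definition line is not shown in this excerpt).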
    centers = [
        individual_features[i:(i + individual_length)]
        for i in range(0, len(individual_features), individual_length)
    ]
    distance_list = []
    for a, b in itertools.combinations(centers, 2):
        d1 = distance.euclidean(a, b)
        distance_list.append(d1)
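    # Dc: largest separation between any two cluster centres;
    # Ec: membership-weighted total distance of points to their centres.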
    Dc = max(distance_list)
    Ec = np.sum(individual.partition_matrix * individual.distance_matrix)
    PBM_index = math.pow((Dc / (individual.no_of_Cluster * Ec)), 2)
    return PBM_index


individual_list = []
individual_length, individual_list = preprocess(
    'Input_data/preprocessed_BCLL.txt')
data_matrix_trans = preprocess_fcm_datamatrix(individual_length,
                                              individual_list)
individual_no = len(individual_list)
data_matrix = np.array(individual_list)
print("##", data_matrix.shape)
print(data_matrix_trans.shape)
"""
Input from user
"""
chromosome_number = int(
    sys.argv[1]
)  # Enter the number of chromosomes (individuals) you want to generate
generation_number = int(sys.argv[2])

#problem = Problem(num_of_variables=3, objectives=[f1, f2], variables_range=[(-5, 5)], same_range=True, expand=False)