def main(argv):
    id_list, fasta_list, config_file = '', '', ''
    # Use getopt to read the input arguments
    try:
        opts, args = getopt.getopt(argv, "f:i:c:h", ["fasta=", "id=", "config=", "help"])
    except getopt.GetoptError:
        print 'Invalid arguments:\nUsage:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-help]'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print "USAGE:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-h]\n\t-i\t\tText file containing protein OMA ids (1 id per line)\n\t-f\t\tList of input protein sequences in fasta format\n\t-c\t\tConfiguration file for setting program's dependencies"
            sys.exit(2)
        elif opt in ('-i', '--id'):
            id_list = arg
        elif opt in ('-f', '--fasta'):
            fasta_list = arg
        elif opt in ('-c', '--config'):
            config_file = arg
        else:
            print 'Invalid arguments:\nUsage:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-help]'
            sys.exit(2)

    config_file = os.path.abspath(config_file)
    # Call setParams in the configure module to read the tool parameters
    proteinParams = configure.setParams(config_file)

    if id_list != '':
        for ids in open(id_list):
            print '##### Running for OMA id: %s #####' % ids.split()[0]
            if proteinParams.preprocessing:
                preprocessing.Preprocessing(ids.split()[0], 'None', config_file)
            if proteinParams.traceability_calculation:
                traceabilityCalculation.main(ids.split()[0], config_file)
            if proteinParams.mapTraceabilitySpeciesTree:
                mapToSpeciesTree.main(ids.split()[0], config_file)
    elif fasta_list != '':
        with open(fasta_list) as fa:
            for seqs in fa:
                if '>' in seqs:
                    print '##### Running for fasta id: %s #####' % seqs[1:-1]
                    inputId = seqs.split()[0][1:]
                    querySeq = fa.next()
                    if proteinParams.preprocessing:
                        preprocessing.Preprocessing(inputId, querySeq, config_file)
                    if proteinParams.traceability_calculation:
                        traceabilityCalculation.main(inputId, config_file)
                    if proteinParams.mapTraceabilitySpeciesTree:
                        mapToSpeciesTree.main(inputId, config_file)
def get_all_context_4_delta_idf_scores(self, pos_revs, neg_revs):
    """
    This helper method generates four scores on a word basis using positive and negative reviews.

    :param pos_revs: Positive reviews.
    :type pos_revs: list
    :param neg_revs: Negative reviews.
    :type neg_revs: list
    :return: A dictionary containing words as keys and four sentiment scores for each word as values.
    :rtype: dict
    """
    all_revs = self.get_all_revs(pos_revs, neg_revs)
    all_context_words = preprocessing.Preprocessing().get_all_context_words(all_revs=all_revs)
    context_delta_idf_scores = self.extract_all_context_delta_idf_scores(
        all_context_words, self.delta_idf_scores)
    all_context_4_delta_idf_scores = {}
    for target_word, word_context_delta_idf_scores in context_delta_idf_scores.items():
        all_context_4_delta_idf_scores[target_word] = \
            self.get_context_4_delta_idf_scores(target_word,
                                                word_context_delta_idf_scores,
                                                self.delta_idf_scores)
    return all_context_4_delta_idf_scores
def set_traning_data(self, img_path, txt_path):
    if op.isfile("./model/traningData.pkl"):
        print("training file already exists")
        return
    print 'make training data...'
    pf = preprocessing.Preprocessing()
    pf.set_img_path(img_path)
    pf.set_txt_path(txt_path)
    tag_list = pf.get_tag_list()
    trX = []
    trY = []
    print 'find coordinates...'
    count = 0
    total = len(tag_list)
    for temp in tag_list:
        lm = landmark.Landmark(img_path + '/' + temp[0] + '.jpg')
        lm.get_face_landmarks()
        if lm.coordinate is None:
            continue
        trX.append(lm.coordinate.reshape((1, 30))[0])
        trY.append(FTYPE[temp[1]])
        count += 1
        sys.stdout.write('\r%d%%' % int(float(count) / float(total) * 100))
        sys.stdout.flush()
    print '\nTotal', count, 'data,', total - count, 'data missing'
    trainingItem = np.array(trX), np.array(trY)
    pickle.dump(trainingItem, open("./model/traningData.pkl", "wb"))
    print 'created traningData.pkl file'
def load_test_char_data(self, test_file):
    Preprocessing = preprocessing.Preprocessing(self.train_file, self.max_word_length)
    Preprocessing.generate_char_one_hot_vec_and_num_encoding_for_file(
        test_file, self.divide_file_factor)
    test_file_substring = test_file + "_char_num_encoded"
    test_char_data = self.read_parts_from_file(test_file_substring, self.max_word_length)
    return test_char_data
def load_global_word_vectors(self, file, vec_file):
    Preprocessing = preprocessing.Preprocessing(self.train_file, self.max_word_length)
    length_of_vectors = Preprocessing.process_global_word_vectors(
        file, vec_file, self.divide_file_factor)
    word_vector_file_substring = file + '_word_vectors_from_' + ntpath.basename(vec_file)
    word_vectors = self.read_parts_from_file(word_vector_file_substring, length_of_vectors)
    return word_vectors
def load_train_char_data(self):
    Preprocessing = preprocessing.Preprocessing(self.train_file, self.max_word_length)
    Preprocessing.build_char_dic()
    Preprocessing.generate_char_one_hot_vec_and_num_encoding_for_file(
        self.train_file, self.divide_file_factor)
    train_file_substring2 = str(self.train_file) + '_char_num_encoded'
    train_char_data = self.read_parts_from_file(train_file_substring2, self.max_word_length)
    return train_char_data
def __init__(self):
    self.tweets = []
    self.preprocessor = Preprocessing.Preprocessing()
    # The stemmer also stems the query content
    stop_words = self.preprocessor.stops()
    analyzer = StemmingAnalyzer(stoplist=stop_words)
    analyzer.cachesize = -1  # Unbounded caching, but worse memory performance
    # Truncate the output files
    file("results.txt", "w")
    file("topResults.txt", "w")
    file("term_stats.txt", "w")
    self.schema = Schema(title=TEXT(stored=True),
                         content=TEXT(stored=True, analyzer=analyzer))
def move(self):
    # This function is called on every turn of a game. It's how your snake decides where to move.
    # Valid moves are "up", "down", "left", or "right".
    # TODO: Use the information in cherrypy.request.json to decide your next move.
    data = cherrypy.request.json
    info = ppc.Preprocessing(data["board"], data["you"])

    # Choose a random direction to move in
    possible_moves = ["up", "down", "left", "right"]
    move = random.choice(possible_moves)

    print(f"MOVE: {move}")
    return {"move": move}
def main(): """ The main function. :return: This main function builds the model and performs the evaluation. :rtype: None """ parser_ = create_parser() args = parser_.parse_args() constants.COMMAND = args.command constants.LANG = args.language constants.EMBEDDING_TYPE = args.embedding_type constants.EMBEDDING_SIZE = args.embedding_size constants.CV_NUMBER = args.cv_number constants.USE_3_REV_POL_SCORES = args.use_3_review_polarities constants.DATASET_PATH = args.file_path constants.TRAINING_FILE = args.training_path constants.TEST_FILE = args.test_path constants.MODEL_FILE_NAME = args.model_path # Cross-validation to be performed on a single dataset. if args.command == "cross_validate": svm.run_cross_validation_svm(constants.DATASET_PATH) # Training and test datasets are provided separately. elif args.command == "train_and_test_separately": training_file = constants.TRAINING_FILE_PATH test_file = constants.TEST_FILE_PATH svm.train_and_test_separate_files(training_file, test_file) # Only the label of a single review to be typed in the terminal is predicted. elif args.command == "predict": pre = preprocessing.Preprocessing() reviews, labels = pre.get_data(constants.DATASET_PATH) model, sf, tr_vecs, imp = svm.generate_model(reviews, labels) # The following command could also be used instead of the above two commands. # (model, sf, tr_vecs, imp) = pickle.load(open(constants.MODEL_FILE_NAME, "rb")) print("Please, enter a text below:") line = sys.stdin.readline() while line: line = line.strip("\n") line = pre.preprocess_one_line(line) sentiment = svm.test_model(model, sf, tr_vecs, imp, line)[0][0] sentiment = "Positive" if sentiment == "P" else "Negative" print(sentiment) print("Please, enter a text below:") line = sys.stdin.readline()
def prom_cnn():
    prep = preprocessing.Preprocessing(r'dict', special_token='<UNK>')
    final_embeddings = []
    with open('./matrix/embeddings.txt', 'r', encoding='utf8') as f:
        emb_lines = f.readlines()
    for i in range(len(emb_lines)):
        final_embeddings.append(
            [float(num) for num in emb_lines[i].replace('\n', '').split('\t')[1:]])
    final_embeddings = get_emb_mat(final_embeddings)
    _, tag_vocab = prepare_data(r'./Data/nomenklatura.csv', columnname=['FullName', 'Count'])
    index_to_count = {0: '10,01', 1: '10,02', 2: '10,05', 3: '10,09', 4: '10,06',
                      5: '10,08', 6: '10,12', 7: '10.10', 8: '10.11.01', 9: '10.03',
                      10: '10.11.02', 11: '10,07', 12: '10,04'}
    cnn = CNN(n_tags=len(tag_vocab), emb_mat=final_embeddings, n_hidden_list=[100, 100, 100])
    # batch = [[prep.word_to_index[i] for i in prep.prepare_data(elem)]]
    # y_pred = index_to_count[cnn.predict(tok=batch)[0]]
    return cnn, prep, index_to_count
def move(self):
    # This function is called on every turn of a game. It's how your snake decides where to move.
    # Valid moves are "up", "down", "left", or "right".
    # TODO: Use the information in cherrypy.request.json to decide your next move.
    data = cherrypy.request.json
    info = ppc.Preprocessing(data["board"], data["you"])

    possible_moves = ["up", "down", "left", "right"]
    # Compute board weights, then pick the move along the shortest path
    info.get_weights()
    move, shortest_weight, path = info.get_shortest_path(6)

    print("move:", move)
    print("smallest weight:", shortest_weight)
    print("path:", path)
    return {"move": move}
def run(self):
    if self.conf['preprocessing']['enable'] != False:
        os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' + 'preprocessing')
        preprocessing_obj = preprocessing.Preprocessing(self.result_dir, self.conf,
                                                        input_file=self.input_file)
        if self.conf['preprocessing']['fastqc']['enable'] != False:
            preprocessing_obj.fastqc_single_end('.')
        temp_file = open(os.path.abspath('.') + '/' + self.result_dir + '/Summary_of_results.html', 'a+')
        temp_file.write('<ul>\n')
        temp_file.write('<li>preprocessing result is in %s</li>\n' % './preprocessing')
        temp_file.write('<li><a href="%s">click to report</a></li>\n' % ('./preprocessing/'))
        temp_file.write('</ul>\n')
        temp_file.close()
    if self.conf['identification']['enable'] != False:
        os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' + 'identification')
        resequencing_obj = annotation.Resequencing(self.result_dir, self.conf, self.input_file)
        resequencing_obj.run_3gs()
def test_preprocess_one_image(self):
    p = preprocessing.Preprocessing()
    p.load_files()
    for i in [0, 500, 900, 2000, 3000, 4000, 5000, 6000, 7000]:
        image = p.x[i]
        io.imsave(os.path.join('tmp', 'image{}.png'.format(i)), image)
        processed_image = p.preprocess_image(image)
        io.imsave(os.path.join('tmp', 'image{}_after.png'.format(i)), processed_image)
        variations = p.create_variations(processed_image)
        for j in range(len(variations)):
            io.imsave(
                os.path.join('tmp', 'image{0}_after_variation{1}.png'.format(i, j)),
                variations[j]
            )
def test_irisDataOutput(self):
    with open('iris.csv', 'r') as datafile:
        reader = csv.DictReader(datafile, delimiter=';')
        data = []
        for line in reader:
            data.append(line)
    pre = preprocessing.Preprocessing(data)
    normalizedData = pre.normalizeData()
    dbscanner = DBSCAN()
    data = dbscanner.cluster(normalizedData, 0.065, 4, 'eucl',
                             ['Case', 'class', 'sepal_width', 'sepal_length'])
    writer = output.ClusterImageWriter('iris.csv', 'output')
    writer.writeDBSCANImage(data, 'cluster', 'petal_width', 'petal_length', 4, 0.07)
def get_eng_dict_defs(dict_path):
    """
    This function generates dictionary definitions for English.

    :param dict_path: The path to the SentiWordNet lexicon.
    :type dict_path: str
    :return: English dictionary
    :rtype: dict
    """
    dict_defs = {}
    eng_dict_word_freqs = Counter()
    pre = preprocessing.Preprocessing()
    with open(dict_path, "r") as d:
        for line in d:
            if line[0] == "#" or line[0].isspace():
                continue
            line = line.lower()
            line_spl = line.split('\t')
            synsets = line_spl[4].split()
            # The main meaning/synset is captured.
            synsets = [synset[:-2] for synset in synsets if synset[-1] == "1"]
            gloss = line_spl[5]  # The dictionary definition of the entry word.
            gloss_toks = pre.english_tokenize(gloss)
            for gloss_tok in gloss_toks:
                eng_dict_word_freqs[gloss_tok] += 1
            for synset in synsets:
                if synset not in dict_defs:
                    dict_defs[synset] = []
                dict_defs[synset].extend(gloss_toks)
                dict_defs[synset].append(synset)
    dict_defs = lexical_interface.get_dict_except_most_and_least_freq(
        eng_dict_word_freqs, dict_defs)
    return dict_defs
def main():
    with open("iris.csv", 'r') as datafile:
        reader = csv.DictReader(datafile, delimiter=';')
        data = []
        for line in reader:
            data.append(line)
    pre = preprocessing.Preprocessing(data)
    normalized_data = pre.normalizeData()
    k_means = kMeans()
    data_kmeans = k_means.cluster(normalized_data, k=3, dist='eucl',
                                  centreMethod='rand', filterKeys=['Case', 'class'])
    dbscan = DBSCAN()
    data_dbscan = dbscan.cluster(normalized_data, eps=0.3, MinPts=2,
                                 dist='eucl', filterKeys=['Case', 'class'])
    #printData(data_dbscan)
    printData(data_kmeans)
    pp = pprint.PrettyPrinter(indent=2)
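The printData helper called above is not shown in the example; given the trailing PrettyPrinter line, a minimal hypothetical sketch might look like this (the implementation is an assumption, not taken from the original source):

import pprint

def printData(clustered_data):
    # Hypothetical helper: pretty-print each clustered record, i.e. a dict of
    # attribute -> value pairs plus the assigned cluster.
    printer = pprint.PrettyPrinter(indent=2)
    for record in clustered_data:
        printer.pprint(record)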
def __init__(self, dataset_path='', model_id=None, dataset_id=None,
             train_params=None, workers=cpu_count):
    self.workers = workers
    self.pp = preprocessing.Preprocessing()
    self.dataset_id = dataset_id
    self.model = None
    if model_id:
        self.model_id = str(model_id)
        print('Model {} loaded'.format(self.model_id))
        self.model = self.load_model_from_disk()
        self.model_vocab = set(self.model.wv.vocab.keys())
        self.dataset_interface = DatasetInterface(self.model_id)
    if self.dataset_id:
        self.dataset_interface = DatasetInterface(self.dataset_id)
        self.train_data = self.prepare_doc2vec_train_data(self.dataset_interface.dataset)
        # self.train_data = self.prepare_doc2vec_train_data_int(self.dataset_interface.dataset)
        self.init_and_train(**train_params)
    else:
        print('No params')
def test_irisDataOutput(self):
    with open('iris.csv', 'r') as datafile:
        reader = csv.DictReader(datafile, delimiter=';')
        data = []
        for line in reader:
            data.append(line)
    pre = preprocessing.Preprocessing(data)
    normalizedData = pre.normalizeData()
    k_means = kMeans()
    k_means.cluster(normalizedData, k=3, dist='eucl',
                    centreMethod='rand', filterKeys=['Case', 'class'])
    writer = output.ClusterImageWriter('iris.csv', 'output')
    writer.writeKMeansImages(k_means.iterData, k_means.iterCentres,
                             'cluster', 'dist2clu', 'petal_width', 'petal_length')
    writer.writeGif()
def extract_sample(file_, size=60, sex=False, out_size=9):
    # Preprocess the file ...
    x = prep.Preprocessing(file_, size)
    x.start_point_detection(threshold=0.5, n=10)
    x.cut_first_max(n=20)
    x.normalize()
    x.fit()
    x.get_subset('static')
    num = int(re.search(r'(?=.*)[0-9](?=.*)', file_).group(0))
    # Flatten the whole array into a single column of features
    features = x.data.reshape((len(x.data) * len(x.data[0]), 1))
    if sex:
        if re.search(r'.*woman.*', file_):
            labels = vectorize_output(num - 1 + out_size, shape=(out_size * 2, 1))
        else:
            labels = vectorize_output(num - 1, shape=(out_size * 2, 1))
    else:
        labels = vectorize_output(num - 1, shape=(out_size, 1))
    return (features, labels)
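The vectorize_output helper is not defined in the snippet above; judging from how it is called, it plausibly builds a one-hot column vector for a class index. A minimal sketch under that assumption (behaviour inferred, not taken from the original source):

import numpy as np

def vectorize_output(index, shape):
    # Hypothetical helper: one-hot column vector, all zeros except a 1.0 at
    # the given class index (e.g. shape=(9, 1) for 9 classes).
    vec = np.zeros(shape)
    vec[index, 0] = 1.0
    return vec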
                os.remove(self.model_path + "checkpoint")
                return
            if len(batch) > remain:
                for bat in sorted(batch)[:-(remain)]:
                    for file in filelists:
                        if str(bat) in file and "chatbot_seq2seq" in file:
                            os.remove(self.model_path + file)
        except Exception as e:
            return


if __name__ == '__main__':
    seq = Seq2seq()
    if sys.argv[1]:
        if sys.argv[1] == 'server':
            pp = preprocessing.Preprocessing()
            print("server run.. ")
            soc = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # Create a socket object
            host = "127.0.0.1"  # Local host address
            port = 1994  # Reserve a port for the service.
            soc.bind((host, port))  # Bind to the port
            soc.listen(5)  # Now wait for client connections.
            while True:
                conn, addr = soc.accept()  # Establish a connection with the client.
                print("Got connection from", addr)
                question = conn.recv(1024)
                relation = pp.term_relationship(question.decode("utf-8"), 10)
                answer = seq.predict(relation if relation != "" else str(question))
                print(answer)
"y": 55 }, "EM": { "x": 85, "y": 85 }, "TNBC": { "x": 20, "y": 20 }, "ssTEM": { "x": 32, "y": 32 }, } print("----Starting Data Download-----") preprocess = preprocessing.Preprocessing(selections=selections, datasets=datasets, k_shots=k_shots, target_dir=target_dir) subprocess.call(["sh", "./downloadUnzipDatasets.sh"]) print("-----Finished Data Download-----") extractFewShotTargetSelections.extractFewShotTargetSelection() print("-----Preprocessing Few-Shot Target Selections-----") preprocess.reprocessFTandTestSamples(crop_steps_dataset=ft_crop_steps_dataset, remove_black_images=True) print("-----Preprocessing Source Datasets-----") preprocess.preprocess_Source_Data(crop_steps_dataset=source_crop_steps_dataset, remove_black_images=True)
    - perform the training of the chosen model,
    - save the models,
    - perform a prediction and save it to a file in CSV format,
    - evaluate the prediction against the provided (blind) test set:
        - compute and plot the confusion matrix,
        - print the classification report,
        - compute and print the accuracy of the models.
####################################################################################################
'''

'''
Preprocessing:
In this section the preprocessing is performed:
    - obtain the features with the two kinds of preprocessing adopted: unigrams and bigrams with occurrence counts,
    - the models are instantiated with the different algorithms available in the 'Model' class.
'''
Preprocessing_data = preprocessing.Preprocessing()  # instance of preprocessing for the data
Preprocessing_prediction = preprocessing.PreprocessingPrediction()  # instance of preprocessing for the prediction phase

kind_preprocessing_unigrams = Preprocessing_data.unigrams  # kind of preprocessing: unigrams
kind_preprocessing_bigrams = Preprocessing_data.bigrams  # kind of preprocessing: bigrams
kind_label_opt = Preprocessing_data.opt  # kind of label used for the binary case: opt
kind_label_compiler = Preprocessing_data.compiler  # kind of label used for the multi-class case: compiler

'''
The choice of algorithm:
To use an algorithm in main, instantiate the desired one from the following list (a hypothetical usage sketch follows this snippet):
    - Naive Bayes Multinomial algorithm => NaiveBayesMultinomial()
    - Naive Bayes Bernoulli algorithm => NaiveBayesMBernoulli()
    - SVC algorithm with 'rbf' as kernel => RbfSVC()
    - SVC algorithm with 'Linear' as kernel => LinearSVC()
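A minimal, hypothetical sketch of how one of the algorithms listed above might be instantiated; the constructor names come from the list, but the train/predict method names are assumptions, not taken from the original 'Model' class:

# Pick one of the algorithms named in the docstring above.
model = NaiveBayesMultinomial()  # or NaiveBayesMBernoulli(), RbfSVC(), LinearSVC()
# Hypothetical calls: train on the unigram features with the binary 'opt' labels,
# then predict on the prediction-phase preprocessing output.
model.train(kind_preprocessing_unigrams, kind_label_opt)
predictions = model.predict(Preprocessing_prediction)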
def setUp(self):
    self.p = preprocessing.Preprocessing()
    self.p.load_files()
    self.c = Classifier('extra_trees')
def clustering(self):
    # Check input validity
    if not self.filename:
        self.error_text.config(text="Choose a file first")
        return
    if self.algorithm.get() == 'kmeans' and not is_int(self.k_entry.get()):
        self.error_text.config(text="Select k")
        return
    if self.algorithm.get() == 'dbscan' and not is_int(self.min_pts_entry.get()) \
            and not is_float(self.eps_entry.get()):
        self.error_text.config(text="Select MinPts and Eps")
        return
    self.error_text.config(text="")

    filteredKeys = []
    selectedKeys = []
    i = 0
    while i < len(self.boxes):
        if self.boxes[i].get() == 0:
            filteredKeys.append(self.attributes[i])
        else:
            selectedKeys.append(self.attributes[i])
        i += 1

    pre = preprocessing.Preprocessing(self.data)
    #pre.removeAttributes(["Att1", "Att3", "For example"])
    normalized_data = pre.normalizeData()
    clustered_data = None
    k_means = kMeans()
    dbscan = DBSCAN()
    if self.algorithm.get() == 'kmeans':
        clustered_data = k_means.cluster(normalized_data,
                                         k=self.k_entry.get(),
                                         dist=self.dist.get(),
                                         centreMethod='rand',
                                         filterKeys=filteredKeys)
    else:
        clustered_data = dbscan.cluster(normalized_data,
                                        eps=float(self.eps_entry.get()),
                                        MinPts=int(self.min_pts_entry.get()),
                                        dist=self.dist.get(),
                                        filterKeys=filteredKeys)

    with open(self.algorithm.get() + ".csv", "w") as outfile:
        keysExist = False
        csvwriter = csv.writer(outfile, delimiter=';')
        for line in clustered_data:
            keys = []
            values = []
            for key, value in line.items():
                keys.append(key)
                values.append(value)
            if not keysExist:
                csvwriter.writerow(keys)
                keysExist = True
            csvwriter.writerow(values)

    # Remove the old gif
    if self.updater is not None:
        self.master.after_cancel(self.updater)
        self.updater = None

    # Create the image file and show the image in the UI
    imagefile = self.algorithm.get()
    clustWriter = ClusterImageWriter(
        imagefile, datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S'))
    picname = ""
    gifname = ""
    if self.algorithm.get() == 'kmeans':
        names = clustWriter.writeKMeansImages(k_means.iterData, k_means.iterCentres,
                                              'cluster', 'dist2clu',
                                              selectedKeys[0], selectedKeys[1])
        gifname = clustWriter.writeGif()
        picname = names.pop()
    else:
        picname = clustWriter.writeDBSCANImage(clustered_data, 'cluster',
                                               selectedKeys[0], selectedKeys[1],
                                               4, 0.07)
        gifname = clustWriter.writeGif()
    pic = PhotoImage(file=picname)
    set_image(self.canvas, pic)
    if self.updater is None:
        self.updater = self.master.after(0, self.update_gif, self.gifcanvas,
                                         get_frames(gifname), 0)
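The is_int and is_float validators used at the top of clustering() are not shown in the snippet; a minimal sketch of what they might look like (the names come from the calls above, the implementations are assumptions):

def is_int(value):
    # True if the string can be parsed as an integer.
    try:
        int(value)
        return True
    except ValueError:
        return False

def is_float(value):
    # True if the string can be parsed as a float.
    try:
        float(value)
        return True
    except ValueError:
        return False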
def run_single(self):
    if self.conf['preprocessing']['enable'] != False:
        os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' + 'preprocessing')
        preprocessing_obj = preprocessing.Preprocessing(self.result_dir, self.conf,
                                                        input_file=self.input_file)
        if self.conf['preprocessing']['fastp']['enable'] != False:
            preprocessing_obj.fastp_single_end()
            self.preprocessing_output = self.result_dir + '/preprocessing/fastp_output.fastq'
        else:
            if self.conf['preprocessing']['fastqc']['enable'] != False:
                os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' +
                         'preprocessing/fastqc_before_filtering')
                preprocessing_obj.fastqc_single_end('fastqc_before_filtering')
            if self.conf['preprocessing']['trimmomatic']['enable'] != False:
                preprocessing_obj.trimmomatic_single_end()
                self.preprocessing_output = self.result_dir + '/preprocessing/trimmomatic_output.fastq'
                if self.conf['preprocessing']['fastqc']['enable'] != False:
                    preprocessing_obj.input_file = self.preprocessing_output
                    os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' +
                             'preprocessing/fastqc_after_filtering')
                    preprocessing_obj.fastqc_single_end('fastqc_after_filtering')
            elif self.conf['preprocessing']['cutadapt']['enable'] != False:
                preprocessing_obj.cutadapt_single_end()
                self.preprocessing_output = self.result_dir + '/preprocessing/cutadapt_output.fastq'
                if self.conf['preprocessing']['fastqc']['enable'] != False:
                    preprocessing_obj.input_file = self.preprocessing_output
                    os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' +
                             'preprocessing/fastqc_after_filtering')
                    preprocessing_obj.fastqc_single_end('fastqc_after_filtering')
            elif self.conf['preprocessing']['sickle']['enable'] != False:
                preprocessing_obj.sickle_single_end()
                self.preprocessing_output = self.result_dir + '/preprocessing/sickle_output.fastq'
                if self.conf['preprocessing']['fastqc']['enable'] != False:
                    preprocessing_obj.input_file = self.preprocessing_output
                    os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' +
                             'preprocessing/fastqc_after_filtering')
                    preprocessing_obj.fastqc_single_end('fastqc_after_filtering')
            elif self.conf['preprocessing']['SOAPnuke']['enable'] != False:
                preprocessing_obj.soapnuke_single_end()
                self.preprocessing_output = self.result_dir + '/preprocessing/SOAPnuke_output.fastq'
                if self.conf['preprocessing']['fastqc']['enable'] != False:
                    preprocessing_obj.input_file = self.preprocessing_output
                    os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' +
                             'preprocessing/fastqc_after_filtering')
                    preprocessing_obj.fastqc_single_end('fastqc_after_filtering')
            else:
                self.preprocessing_output = self.input_file
        temp_file = open(os.path.abspath('.') + '/' + self.result_dir + '/Summary_of_results.html', 'a+')
        temp_file.write('<ul>\n')
        temp_file.write('<li>preprocessing result is in %s</li>\n' % './preprocessing')
        temp_file.write('<li><a href="%s">click to report</a></li>\n' % ('./preprocessing/'))
        temp_file.write('</ul>\n')
        temp_file.close()
    else:
        self.preprocessing_output = self.input_file
    if self.conf['identification']['enable'] != False:
        os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' + 'identification')
        resequencing_obj = annotation.Resequencing(self.result_dir, self.conf,
                                                   self.preprocessing_output)
        resequencing_obj.run()
def get_turkish_dict_defs(dict_path):
    """
    This function extracts the dictionary definitions of words in Turkish.

    :param dict_path: The path to the Turkish dictionary that is in parsed and disambiguated form.
    :type dict_path: str
    :return: A dict containing words and their dictionary definitions.
    :rtype: dict
    """
    pre = preprocessing.Preprocessing()
    all_mwes = pre.get_turkish_idioms()
    dict_definitions = {}
    for filename in os.listdir(dict_path):
        file = os.path.join(dict_path, filename)
        with open(file, "r") as f:
            meaning = False
            dict_entry_word = ""
            for line in f:
                line = line.lower()
                if line[0] == "[" or line[0] == "]":
                    continue
                if WORD_SEP in line:
                    # The below if block performs some preprocessing operations.
                    if dict_entry_word in dict_definitions:
                        # The below line of code could have been optimised.
                        dict_def = " ".join(list(dict_definitions[dict_entry_word]))
                        upd_def = pre.replace_mwe(dict_def, all_mwes)
                        upd_def = pre.capture_and_update_consec_negs(upd_def)
                        dict_definitions[dict_entry_word] = set(upd_def)
                    dict_entry_word = ""
                    meaning = False
                    continue
                if MEANING_MARK in line:
                    meaning = True
                    if dict_entry_word[-1] == '_':
                        dict_entry_word = dict_entry_word[:-1]
                    dict_entry_word = remove_noise_char(dict_entry_word)
                    if dict_entry_word not in dict_definitions:
                        dict_definitions[dict_entry_word] = set([])
                        # dict_definitions[dict_entry_word].add(dict_entry_word)
                    continue
                if not meaning and "[" in line:
                    if "lH[Adj+With]" in line.split()[1]:
                        dict_entry_word += line.split()[0]
                    else:
                        dict_entry_word += line.split("[")[0].split()[1]
                elif meaning and "[" in line:
                    meaning_word = line.split("[")[0].split()[1]
                    meaning_word = remove_noise_char(meaning_word)
                    turk_dict_word_freqs[meaning_word] += 1
                    dict_definitions[dict_entry_word].add(meaning_word)
    dict_definitions = lexical_interface.get_dict_except_most_and_least_freq(
        turk_dict_word_freqs, dict_definitions)
    return dict_definitions
train_char_data = ld.load_train_char_data()
print('train_char_data.shape: ' + str(train_char_data.shape))

train_data_class_annotation, unique_edit_trees_from_train_data = \
    ld.load_class_annotation_and_unique_edit_trees_from_train_data()
print('train_data_class_annotation: ' + str(train_data_class_annotation.shape))
print('size of unique_edit_trees_from_train_data: ' + str(len(unique_edit_trees_from_train_data)))

train_data_applicable_trees = ld.load_applicable_trees_data(train_file)
print('train_data_applicable_trees: ' + str(train_data_applicable_trees.shape))

train_data_global_vectors = ld.load_global_word_vectors(train_file, global_vector_file)
print('train_data_global_vectors.shape: ' + str(train_data_global_vectors.shape))

nb_tree_classes = len(unique_edit_trees_from_train_data)
word_vector_size = int(train_data_global_vectors.shape[1])
embedded_char_vector_length = len(preprocessing.Preprocessing(train_file).get_char_dic())

#***************************** Training Phase Start *****************************#
#********************** Building Char Model *************************#
# no_of_words = int(train_char_data.shape[0])
char_model = models.get_char_model(char_network_cell, max_word_length,
                                   embedded_char_vector_length, char_feature_output)
print('char model summary:')
print(char_model.summary())

char_model_file_name = train_file + '_char_level_' + char_network_cell + '.h5'
if os.path.isfile(os.path.join('./', char_model_file_name)):
    char_model.load_weights(os.path.join('./', char_model_file_name))
    print('loaded ' + char_model_file_name + ' from disk')
else:
import preprocessing as pp
import classifier as cc
import visualizer as vs
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

cols = [i for i in range(0, 32)]
df = pd.read_csv('breast_cancer.csv', index_col=32)
df = df.drop('id', axis=1)
df = pp.Preprocessing().handleMissing(df)

X = df.drop('diagnosis', axis=1)
X = pp.Preprocessing().scale(X.values, type=pp.STANDARD_SCALER)
y = df['diagnosis']
y = pp.Preprocessing().encode(y.values, type=pp.LABEL_ENCODER)

X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis', axis=1), y,
                                                    random_state=0, test_size=0.33)

# Random Forest
params = dict(n_estimators=95, random_state=0, criterion='gini')
classifier = cc.Classifier(type=cc.RANDOM_FOREST, **params)
classifier.fit(X_train, y_train)
print("******************Random Forest******************")
print(classifier.score(X_test, y_test))
print("*************************************************")