Example 1
def main(argv):
	
	id_list, fasta_list, config_file = '', '', ''

	# Setting the get options method to read the input arguments
	try:
		opts, args = getopt.getopt(argv, "f:i:c:h", ["fasta=", "id=", "config=", "help"])
	except getopt.GetoptError:
		print 'Invalid arguments:\nUsage:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-help]'
		sys.exit(2)

	for opt, arg in opts:
		if opt in ('-h','--help'):
			print "USAGE:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-h]\n\t-i\t\tText file containing protein OMA ids (1 id per line)\n\t-f\t\tList of input protein sequences in fasta format\n\t-c\t\tConfiguration file for setting program's dependencies"
			sys.exit(2)
		elif opt in ('-i', '--id'):
			id_list = arg
		elif opt in ('-f','--fasta'):
			fasta_list = arg
		elif opt in ('-c','--config'):
			config_file = arg
		else:
			print 'Invalid arguments:\nUsage:\tprotTrace.py -i <omaIdsFile> | -f <fastaSeqsFile> -c <configFile> [-help]'
			sys.exit(2)
	
	config_file = os.path.abspath(config_file)
	
	# Calling the class in configure.py module and setting the tool parameters
	proteinParams = configure.setParams(config_file)
	
	if id_list != '':
		for ids in open(id_list):
			print '##### Running for OMA id: %s #####' %ids.split()[0]
			if proteinParams.preprocessing:	
				preprocessing.Preprocessing(ids.split()[0], 'None', config_file)
			if proteinParams.traceability_calculation:
				traceabilityCalculation.main(ids.split()[0], config_file)
			if proteinParams.mapTraceabilitySpeciesTree:
				mapToSpeciesTree.main(ids.split()[0], config_file)
	elif fasta_list != '':
		with open(fasta_list) as fa:
			for seqs in fa:
				if '>' in seqs:
					print '##### Running for fasta id: %s #####' %seqs[1:-1]
					inputId = seqs.split()[0][1:]
					querySeq = fa.next()

				if proteinParams.preprocessing:
					preprocessing.Preprocessing(inputId, querySeq, config_file)
				if proteinParams.traceability_calculation:
					traceabilityCalculation.main(inputId, config_file)
				if proteinParams.mapTraceabilitySpeciesTree:
					mapToSpeciesTree.main(inputId, config_file)
Example 2
    def get_all_context_4_delta_idf_scores(self, pos_revs, neg_revs):
        """
        This helper method generates four scores on a word-basis using positive and negative reviews.

        :param pos_revs: Positive reviews.
        :type pos_revs: list
        :param neg_revs: Negative reviews.
        :type neg_revs: list
        :return: A dictionary containing words as keys and four sentiment scores for each word as values.
        :rtype: dict
        """

        all_revs = self.get_all_revs(pos_revs, neg_revs)
        all_context_words = preprocessing.Preprocessing(
        ).get_all_context_words(all_revs=all_revs)

        context_delta_idf_scores = self.extract_all_context_delta_idf_scores(
            all_context_words, self.delta_idf_scores)

        all_context_4_delta_idf_scores = {}

        for target_word, word_context_delta_idf_scores in context_delta_idf_scores.items(
        ):
            all_context_4_delta_idf_scores[target_word] = \
                self.get_context_4_delta_idf_scores(target_word,
                                                    word_context_delta_idf_scores,
                                                    self.delta_idf_scores)

        return all_context_4_delta_idf_scores
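
Because the excerpt only shows the aggregation step, the following self-contained sketch illustrates the delta-IDF idea that the per-word scores build on. The formula, names, and toy reviews below are illustrative assumptions, not the project's implementation.

# Self-contained illustration of a delta-IDF style score (NOT the project's exact
# formula, which this excerpt does not show).  With the sign convention used here,
# words that are relatively more frequent in positive reviews score above zero.
import math
from collections import Counter

def delta_idf_scores(pos_revs, neg_revs):
    # Document frequency of each word in the positive and negative reviews.
    pos_df = Counter(w for rev in pos_revs for w in set(rev.split()))
    neg_df = Counter(w for rev in neg_revs for w in set(rev.split()))
    n_pos, n_neg = len(pos_revs), len(neg_revs)
    # IDF in the negative corpus minus IDF in the positive corpus, add-one smoothed.
    return {w: math.log2((n_neg + 1) / (neg_df[w] + 1))
               - math.log2((n_pos + 1) / (pos_df[w] + 1))
            for w in set(pos_df) | set(neg_df)}

print(delta_idf_scores(["great battery", "great screen"], ["battery died fast"]))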
Example 3
    def set_traning_data(self, img_path, txt_path):
        if op.isfile("./model/traningData.pkl"):
            print("already exist training file")
            return
        print 'make training data...'
        pf = preprocessing.Preprocessing()
        pf.set_img_path(img_path)
        pf.set_txt_path(txt_path)
        tag_list = pf.get_tag_list()

        trX = []
        trY = []

        print 'find coordinate...'
        count = 0
        total = len(tag_list)
        for temp in tag_list:
            lm = landmark.Landmark(img_path + '/' + temp[0] + '.jpg')
            lm.get_face_landmarks()
            if lm.coordinate is None:
                continue
            trX.append(lm.coordinate.reshape((1, 30))[0])
            trY.append(FTYPE[temp[1]])
            count += 1
            sys.stdout.write('\r%d%%' % int(float(count) / float(total) * 100))
            sys.stdout.flush()

        print '\nTotal', count, 'data,', total - count, 'data missing'

        trainingItem = np.array(trX), np.array(trY)
        pickle.dump(trainingItem, open("./model/traningData.pkl", "wb"))
        print 'create traningData.pkl file'
Example 4
 def load_test_char_data(self, test_file):
     Preprocessing = preprocessing.Preprocessing(self.train_file,
                                                 self.max_word_length)
     Preprocessing.generate_char_one_hot_vec_and_num_encoding_for_file(
         test_file, self.divide_file_factor)
     test_file_substring = test_file + "_char_num_encoded"
     test_char_data = self.read_parts_from_file(test_file_substring,
                                                self.max_word_length)
     return test_char_data
Example 5
 def load_global_word_vectors(self, file, vec_file):
     Preprocessing = preprocessing.Preprocessing(self.train_file,
                                                 self.max_word_length)
     length_of_vectors = Preprocessing.process_global_word_vectors(
         file, vec_file, self.divide_file_factor)
     word_vector_file_substring = file + '_word_vectors_from_' + ntpath.basename(
         vec_file)
     word_vectors = self.read_parts_from_file(word_vector_file_substring,
                                              length_of_vectors)
     return word_vectors
Example 6
 def load_train_char_data(self):
     Preprocessing = preprocessing.Preprocessing(self.train_file,
                                                 self.max_word_length)
     Preprocessing.build_char_dic()
     Preprocessing.generate_char_one_hot_vec_and_num_encoding_for_file(
         self.train_file, self.divide_file_factor)
     train_file_substring2 = str(self.train_file) + '_char_num_encoded'
     train_char_data = self.read_parts_from_file(train_file_substring2,
                                                 self.max_word_length)
     return train_char_data
Example 7
 def __init__(self):
   self.tweets = []
   self.preprocessor = Preprocessing.Preprocessing()
   #Stemmer also stems the query content
   stop_words = self.preprocessor.stops()
   analyzer = StemmingAnalyzer(stoplist=stop_words)
   analyzer.cachesize = -1 # Unbounded caching, but worse memory performance
   file("results.txt", "w")
   file("topResults.txt", "w")
   file("term_stats.txt", "w")
   
   self.schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True, analyzer=analyzer))
Example 8
    def move(self):
        # This function is called on every turn of a game. It's how your snake decides where to move.
        # Valid moves are "up", "down", "left", or "right".
        # TODO: Use the information in cherrypy.request.json to decide your next move.
        data = cherrypy.request.json
        info = ppc.Preprocessing(data["board"], data["you"])

        # Choose a random direction to move in
        possible_moves = ["up", "down", "left", "right"]
        move = random.choice(possible_moves)

        print(f"MOVE: {move}")
        return {"move": move}
Example 9
def main():
    """
    The main function.

    :return: This main function builds the model and performs the evaluation.
    :rtype: None
    """

    parser_ = create_parser()
    args = parser_.parse_args()

    constants.COMMAND = args.command
    constants.LANG = args.language
    constants.EMBEDDING_TYPE = args.embedding_type
    constants.EMBEDDING_SIZE = args.embedding_size
    constants.CV_NUMBER = args.cv_number
    constants.USE_3_REV_POL_SCORES = args.use_3_review_polarities
    constants.DATASET_PATH = args.file_path

    constants.TRAINING_FILE = args.training_path
    constants.TEST_FILE = args.test_path

    constants.MODEL_FILE_NAME = args.model_path

    # Cross-validation to be performed on a single dataset.
    if args.command == "cross_validate":
        svm.run_cross_validation_svm(constants.DATASET_PATH)
    # Training and test datasets are provided separately.
    elif args.command == "train_and_test_separately":
        training_file = constants.TRAINING_FILE_PATH
        test_file = constants.TEST_FILE_PATH
        svm.train_and_test_separate_files(training_file, test_file)
    # Only the label of a single review typed into the terminal is predicted.
    elif args.command == "predict":
        pre = preprocessing.Preprocessing()
        reviews, labels = pre.get_data(constants.DATASET_PATH)
        model, sf, tr_vecs, imp = svm.generate_model(reviews, labels)
        # The following command could also be used instead of the above two commands.
        # (model, sf, tr_vecs, imp) = pickle.load(open(constants.MODEL_FILE_NAME, "rb"))

        print("Please, enter a text below:")
        line = sys.stdin.readline()
        while line:
            line = line.strip("\n")
            line = pre.preprocess_one_line(line)
            sentiment = svm.test_model(model, sf, tr_vecs, imp, line)[0][0]
            sentiment = "Positive" if sentiment == "P" else "Negative"
            print(sentiment)
            print("Please, enter a text below:")
            line = sys.stdin.readline()
Example 10
def prom_cnn():
    prep = preprocessing.Preprocessing(r'dict', special_token='<UNK>')
    final_embeddings = []
    with open('./matrix/embeddings.txt', 'r', encoding='utf8') as f:
        emb_lines = f.readlines()
        for i in range(len(emb_lines)):
            final_embeddings.append([float(num) for num in emb_lines[i].replace('\n', '').split('\t')[1:]])
    final_embeddings = get_emb_mat(final_embeddings)
    _, tag_vocab = prepare_data(r'./Data/nomenklatura.csv', columnname=['FullName', 'Count'])
    index_to_count = {0: '10,01', 1: '10,02', 2: '10,05', 3: '10,09', 4: '10,06', 5: '10,08', 6: '10,12', 7: '10.10',
                      8: '10.11.01', 9: '10.03', 10: '10.11.02', 11: '10,07', 12: '10,04'}
    cnn = CNN(n_tags=len(tag_vocab), emb_mat=final_embeddings, n_hidden_list=[100, 100, 100])
    # batch = [[prep.word_to_index[i] for i in prep.prepare_data(elem)]]
    # y_pred = index_to_count[cnn.predict(tok=batch)[0]]
    return cnn, prep, index_to_count
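
A hypothetical usage sketch based on the commented-out lines above; the input string is an illustrative placeholder for a nomenclature item, and the attribute and method names are taken from those comments rather than verified against the project.

# Hypothetical usage following the commented-out lines above.
cnn, prep, index_to_count = prom_cnn()
elem = "some product description"   # illustrative placeholder input
batch = [[prep.word_to_index[i] for i in prep.prepare_data(elem)]]
predicted_account = index_to_count[cnn.predict(tok=batch)[0]]
print(predicted_account)            # e.g. one of the account codes such as '10,01'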
Example 11
    def move(self):
        # This function is called on every turn of a game. It's how your snake decides where to move.
        # Valid moves are "up", "down", "left", or "right".
        # TODO: Use the information in cherrypy.request.json to decide your next move.
        data = cherrypy.request.json
        info = ppc.Preprocessing(data["board"], data["you"])

        # Weight the board and pick the move that follows the shortest path
        possible_moves = ["up", "down", "left", "right"]
        info.get_weights()
        move, shortest_weight, path = info.get_shortest_path(6)
        print("move:", move)
        print("smallest weight:", shortest_weight)
        print("path:", path)

        return {"move": move}
Example 12
    def run(self):
        if  self.conf['preprocessing']['enable'] != False:
            os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' + 'preprocessing')
            preprocessing_obj = preprocessing.Preprocessing(self.result_dir, self.conf, input_file=self.input_file)
            if self.conf['preprocessing']['fastqc']['enable'] != False:
                preprocessing_obj.fastqc_single_end('.')
                temp_file = open(os.path.abspath('.') + '/' + self.result_dir+'/Summary_of_results.html', 'a+')
                temp_file.write('<ul>\n')
                temp_file.write('<li>preprocessing result is in %s</li>\n' % './preprocessing')
                temp_file.write('<li><a href="%s">click to report</a></li>\n' % ('./preprocessing/'))
                temp_file.write('</ul>\n')
                temp_file.close()

        if  self.conf['identification']['enable'] != False:
            os.mkdir(os.path.abspath('.') + '/' + self.result_dir + '/' + 'identification')
            resequencing_obj = annotation.Resequencing(self.result_dir, self.conf, self.input_file)
            resequencing_obj.run_3gs()
Example 13
    def test_preprocess_one_image(self):
        p = preprocessing.Preprocessing()
        p.load_files()

        for i in [0, 500, 900, 2000, 3000, 4000, 5000, 6000, 7000]:
            image = p.x[i]
            io.imsave(os.path.join('tmp', 'image{}.png'.format(i)), image)

            processed_image = p.preprocess_image(image)
            io.imsave(os.path.join('tmp', 'image{}_after.png'.format(i)), processed_image)

            variations = p.create_variations(processed_image)
            for j in range(len(variations)):
                io.imsave(
                    os.path.join('tmp', 'image{0}_after_variation{1}.png'.format(i, j)),
                    variations[j]
                )
Example 14
    def test_irisDataOutput(self):
        with open('iris.csv', 'r') as datafile:
            reader = csv.DictReader(datafile, delimiter=';')
            data = []
            for line in reader:
                data.append(line)

            pre = preprocessing.Preprocessing(data)
            normalizedData = pre.normalizeData()

            dbscanner = DBSCAN()
            data = dbscanner.cluster(
                normalizedData, 0.065, 4, 'eucl',
                ['Case', 'class', 'sepal_width', 'sepal_length'])

            writer = output.ClusterImageWriter('iris.csv', 'output')
            writer.writeDBSCANImage(data, 'cluster', 'petal_width',
                                    'petal_length', 4, 0.07)
Example 15
def get_eng_dict_defs(dict_path):
    """
    This function generates dictionary definitions for English.

    :param dict_path: The path to the SentiWordNet lexicon.
    :type dict_path: str
    :return: English dictionary
    :rtype: dict
    """

    dict_defs = {}
    eng_dict_word_freqs = Counter()
    pre = preprocessing.Preprocessing()
    with open(dict_path, "r") as d:
        for line in d:
            if line[0] == "#" or line[0].isspace():
                continue
            line = line.lower()

            line_spl = line.split('\t')

            synsets = line_spl[4].split()

            synsets = [synset[:-2] for synset in synsets if synset[-1] == "1"
                       ]  # The main meaning/synset is captured.

            gloss = line_spl[5]  # The dictionary definition of the entry word.

            gloss_toks = pre.english_tokenize(gloss)

            for gloss_tok in gloss_toks:
                eng_dict_word_freqs[gloss_tok] += 1

            for synset in synsets:
                if synset not in dict_defs:
                    dict_defs[synset] = []
                dict_defs[synset].extend(gloss_toks)
                dict_defs[synset].append(synset)

    dict_defs = lexical_interface.get_dict_except_most_and_least_freq(
        eng_dict_word_freqs, dict_defs)
    return dict_defs
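
A hypothetical usage sketch; the lexicon path is illustrative only. The parser above expects the standard tab-separated SentiWordNet 3.0 columns (POS, ID, PosScore, NegScore, SynsetTerms, Gloss), reading the synset terms and the gloss from the fifth and sixth columns.

# Hypothetical usage; the lexicon path is illustrative only.
eng_defs = get_eng_dict_defs("lexicons/SentiWordNet_3.0.0.txt")
print(len(eng_defs))         # number of first-sense entry words kept
print(list(eng_defs)[:5])    # a few of the entry words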
Example 16
def main():
    with open("iris.csv", 'r') as datafile:
        reader = csv.DictReader(datafile, delimiter=';')
        data = []
        for line in reader:
            data.append(line)

        pre = preprocessing.Preprocessing(data)
        normalized_data = pre.normalizeData()

        k_means = kMeans()
        data_kmeans = k_means.cluster(normalized_data, k=3, dist='eucl', centreMethod='rand', filterKeys=['Case', 'class'])
        
        dbscan = DBSCAN()
        data_dbscan = dbscan.cluster(normalized_data, eps=0.3, MinPts=2, dist='eucl', filterKeys=['Case', 'class'])

        #printData(data_dbscan)
        printData(data_kmeans)

        pp = pprint.PrettyPrinter(indent=2)
Example 17
    def __init__(self, dataset_path='', model_id=None, dataset_id=None, train_params=None, workers=cpu_count):
        self.workers = workers
        self.pp = preprocessing.Preprocessing()
        self.dataset_id = dataset_id
        self.model = None

        if model_id:
            self.model_id = str(model_id)
            print('Model {} loaded'.format(self.model_id))
            self.model = self.load_model_from_disk()
            self.model_vocab = set(self.model.wv.vocab.keys())
            self.dataset_interface = DatasetInterface(self.model_id)

        if self.dataset_id:
            self.dataset_interface = DatasetInterface(self.dataset_id)
            self.train_data = self.prepare_doc2vec_train_data(self.dataset_interface.dataset)
            # self.train_data = self.prepare_doc2vec_train_data_int(self.dataset_interface.dataset)
            self.init_and_train(**train_params)
        else:
            print('No params')
Example 18
    def test_irisDataOutput(self):
        with open('iris.csv', 'r') as datafile:
            reader = csv.DictReader(datafile, delimiter=';')
            data = []
            for line in reader:
                data.append(line)

            pre = preprocessing.Preprocessing(data)
            normalizedData = pre.normalizeData()

            k_means = kMeans()
            k_means.cluster(normalizedData,
                            k=3,
                            dist='eucl',
                            centreMethod='rand',
                            filterKeys=['Case', 'class'])

            writer = output.ClusterImageWriter('iris.csv', 'output')
            writer.writeKMeansImages(k_means.iterData, k_means.iterCentres,
                                     'cluster', 'dist2clu', 'petal_width',
                                     'petal_length')
            writer.writeGif()
Example 19
def extract_sample(file_, size=60, sex=False, out_size=9):
    # Preprocess file ...
    x = prep.Preprocessing(file_, size)
    x.start_point_detection(threshold=0.5, n=10)
    x.cut_first_max(n=20)
    x.normalize()
    x.fit()
    x.get_subset('static')

    num = int(re.search(r'(?=.*)[0-9](?=.*)', file_).group(0))

    # make a column of the whole array
    features = x.data.reshape((len(x.data) * len(x.data[0]), 1))

    if sex:
        if re.search(r'.*woman.*', file_):
            labels = vectorize_output(num - 1 + out_size,
                                      shape=(out_size * 2, 1))
        else:
            labels = vectorize_output(num - 1, shape=(out_size * 2, 1))
    else:
        labels = vectorize_output(num - 1, shape=(out_size, 1))

    return (features, labels)
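
A hypothetical call to extract_sample; the file path is a placeholder chosen so that the digit and the 'woman' substring match the regular expressions above.

# Hypothetical usage; the path is illustrative only.
features, labels = extract_sample("data/woman_7_sample.txt", size=60,
                                  sex=True, out_size=9)
print(features.shape, labels.shape)   # flattened feature column and one-hot label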
Example 20
                os.remove(self.model_path + "checkpoint")
                return
            if len(batch) > remain:
                for bat in sorted(batch)[:-(remain)]:
                    for file in filelists:
                        if str(bat) in file and "chatbot_seq2seq" in file:
                            os.remove(self.model_path + file)
        except Exception as e:
            return


if __name__ == '__main__':
    seq = Seq2seq()
    if sys.argv[1]:
        if sys.argv[1] == 'server':
            pp = preprocessing.Preprocessing()
            print("server run.. ")
            soc = socket.socket(socket.AF_INET,
                                socket.SOCK_STREAM)  # Create a socket object
            host = "127.0.0.1"  # Get local machine name
            port = 1994  # Reserve a port for your service.
            soc.bind((host, port))  # Bind to the port
            soc.listen(5)  # Now wait for client connection.
            while True:
                conn, addr = soc.accept()  # Establish connection with client.
                print("Got connection from", addr)
                question = conn.recv(1024)
                relation = pp.term_relationship(question.decode("utf-8"), 10)
                answer = seq.predict(
                    relation if relation != "" else str(question))
                print(answer)
Example 21
        "y": 55
    },
    "EM": {
        "x": 85,
        "y": 85
    },
    "TNBC": {
        "x": 20,
        "y": 20
    },
    "ssTEM": {
        "x": 32,
        "y": 32
    },
}

print("----Starting Data Download-----")
preprocess = preprocessing.Preprocessing(selections=selections,
                                         datasets=datasets,
                                         k_shots=k_shots,
                                         target_dir=target_dir)
subprocess.call(["sh", "./downloadUnzipDatasets.sh"])
print("-----Finished Data Download-----")
extractFewShotTargetSelections.extractFewShotTargetSelection()
print("-----Preprocessing Few-Shot Target Selections-----")
preprocess.reprocessFTandTestSamples(crop_steps_dataset=ft_crop_steps_dataset,
                                     remove_black_images=True)
print("-----Preprocessing Source Datasets-----")
preprocess.preprocess_Source_Data(crop_steps_dataset=source_crop_steps_dataset,
                                  remove_black_images=True)
Example 22
  - perform the training of the chosen model,
  - save the models,
  - perform a prediction and save it to a CSV file,
  - evaluate the prediction against the provided (blind) test set:
      - compute and plot the confusion matrix,
      - print the classification report,
      - compute and print the accuracy of the models.
  ####################################################################################################
'''
'''
Preprocessing:
In this section the preprocessing is performed:
- the features are obtained with the two kinds of preprocessing adopted: unigrams and bigrams with occurrence counts,
- the models are instantiated with the different algorithms available in the 'Model' class.
'''
Preprocessing_data = preprocessing.Preprocessing()  # instance used to preprocess the data
Preprocessing_prediction = preprocessing.PreprocessingPrediction()  # instance used in the prediction phase

kind_preprocessing_unigrams = Preprocessing_data.unigrams  # kind of preprocessing: unigrams
kind_preprocessing_bigrams = Preprocessing_data.bigrams  # kind of preprocessing: bigrams
kind_label_opt = Preprocessing_data.opt  # label used for the binary task: opt
kind_label_compiler = Preprocessing_data.compiler  # label used for the multi-class task: compiler
'''
The choice of algorithm:
In order to use an algorithm in main, instantiate the desired algorithm
from the following list:
- Naive Bayes Multinomial algorithm => NaiveBayesMultinomial()
- Naive Bayes Bernoulli algorithm => NaiveBayesMBernoulli()
- SVC algorithm with the 'rbf' kernel => RbfSVC()
- SVC algorithm with the linear kernel => LinearSVC()
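
To make the setup described above concrete, the following self-contained sketch shows scikit-learn equivalents of the pieces involved (unigram/bigram occurrence features plus two of the listed algorithm families). It is only an illustration under the assumption that the project's Preprocessing and Model classes wrap functionality of this kind; the toy samples and labels are not from the project.

# Illustrative scikit-learn equivalents (an assumption, not the project's code).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

docs = ["mov eax 1 ret", "push rbp mov rbp rsp pop rbp ret"]   # toy samples
labels = ["opt-on", "opt-off"]                                 # toy binary labels

unigram_features = CountVectorizer(ngram_range=(1, 1)).fit_transform(docs)
bigram_features = CountVectorizer(ngram_range=(1, 2)).fit_transform(docs)

naive_bayes = MultinomialNB().fit(unigram_features, labels)    # ~ NaiveBayesMultinomial()
rbf_svc = SVC(kernel="rbf").fit(bigram_features, labels)       # ~ RbfSVC()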
Example 23
 def setUp(self):
     self.p = preprocessing.Preprocessing()
     self.p.load_files()
     self.c = Classifier('extra_trees')
Example 24
    def clustering(self):
        # check input validity
        if not self.filename:
            self.error_text.config(text="Choose a file first")
            return
        if self.algorithm.get() == 'kmeans' and not is_int(self.k_entry.get()):
            self.error_text.config(text="Select k")
            return
        if self.algorithm.get() == 'dbscan' and not is_int(
                self.min_pts_entry.get()) and not is_float(
                    self.eps_entry.get()):
            self.error_text.config(text="Select MinPts and Eps")
            return

        self.error_text.config(text="")

        filteredKeys = []
        selectedKeys = []

        i = 0

        while i < len(self.boxes):
            if self.boxes[i].get() == 0:
                filteredKeys.append(self.attributes[i])
            else:
                selectedKeys.append(self.attributes[i])
            i += 1

        pre = preprocessing.Preprocessing(self.data)
        #pre.removeAttributes(["Att1", "Att3", "For example"])
        normalized_data = pre.normalizeData()

        clustered_data = None
        k_means = kMeans()
        dbscan = DBSCAN()

        if self.algorithm.get() == 'kmeans':
            clustered_data = k_means.cluster(normalized_data,
                                             k=self.k_entry.get(),
                                             dist=self.dist.get(),
                                             centreMethod='rand',
                                             filterKeys=filteredKeys)

        else:
            clustered_data = dbscan.cluster(normalized_data,
                                            eps=float(self.eps_entry.get()),
                                            MinPts=int(
                                                self.min_pts_entry.get()),
                                            dist=self.dist.get(),
                                            filterKeys=filteredKeys)

        with open(self.algorithm.get() + ".csv", "w") as outfile:
            keysExist = False
            csvwriter = csv.writer(outfile, delimiter=';')
            for line in clustered_data:
                keys = []
                values = []
                for key, value in line.items():
                    keys.append(key)
                    values.append(value)
                if not keysExist:
                    csvwriter.writerow(keys)
                    keysExist = True
                csvwriter.writerow(values)

            #remove old gif
            if self.updater is not None:
                self.master.after_cancel(self.updater)
                self.updater = None

            # Create imagefile and show image in UI
            imagefile = self.algorithm.get()
            clustWriter = ClusterImageWriter(
                imagefile,
                datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S'))
            picname = ""
            gifname = ""
            if self.algorithm.get() == 'kmeans':
                names = clustWriter.writeKMeansImages(k_means.iterData,
                                                      k_means.iterCentres,
                                                      'cluster', 'dist2clu',
                                                      selectedKeys[0],
                                                      selectedKeys[1])
                gifname = clustWriter.writeGif()
                picname = names.pop()
            else:
                picname = clustWriter.writeDBSCANImage(clustered_data,
                                                       'cluster',
                                                       selectedKeys[0],
                                                       selectedKeys[1], 4,
                                                       0.07)
                gifname = clustWriter.writeGif()

            pic = PhotoImage(file=picname)
            set_image(self.canvas, pic)

            if self.updater is None:
                self.updater = self.master.after(0, self.update_gif,
                                                 self.gifcanvas,
                                                 get_frames(gifname), 0)
Example 25
    def run_single(self):
        if self.conf['preprocessing']['enable'] != False:
            os.mkdir(
                os.path.abspath('.') + '/' + self.result_dir + '/' +
                'preprocessing')
            preprocessing_obj = preprocessing.Preprocessing(
                self.result_dir, self.conf, input_file=self.input_file)

            if self.conf['preprocessing']['fastp']['enable'] != False:
                preprocessing_obj.fastp_single_end()
                self.preprocessing_output = self.result_dir + '/preprocessing/fastp_output.fastq'
            else:
                if self.conf['preprocessing']['fastqc']['enable'] != False:
                    os.mkdir(
                        os.path.abspath('.') + '/' + self.result_dir + '/' +
                        'preprocessing/fastqc_before_filtering')
                    preprocessing_obj.fastqc_single_end(
                        'fastqc_before_filtering')
                if self.conf['preprocessing']['trimmomatic']['enable'] != False:
                    preprocessing_obj.trimmomatic_single_end()
                    self.preprocessing_output = self.result_dir + '/preprocessing/trimmomatic_output.fastq'
                    if self.conf['preprocessing']['fastqc']['enable'] != False:
                        preprocessing_obj.input_file = self.preprocessing_output
                        os.mkdir(
                            os.path.abspath('.') + '/' + self.result_dir +
                            '/' + 'preprocessing/fastqc_after_filtering')
                        preprocessing_obj.fastqc_single_end(
                            'fastqc_after_filtering')
                elif self.conf['preprocessing']['cutadapt']['enable'] != False:
                    preprocessing_obj.cutadapt_single_end()
                    self.preprocessing_output = self.result_dir + '/preprocessing/cutadapt_output.fastq'
                    if self.conf['preprocessing']['fastqc']['enable'] != False:
                        preprocessing_obj.input_file = self.preprocessing_output
                        os.mkdir(
                            os.path.abspath('.') + '/' + self.result_dir +
                            '/' + 'preprocessing/fastqc_after_filtering')
                        preprocessing_obj.fastqc_single_end(
                            'fastqc_after_filtering')
                elif self.conf['preprocessing']['sickle']['enable'] != False:
                    preprocessing_obj.sickle_single_end()
                    self.preprocessing_output = self.result_dir + '/preprocessing/sickle_output.fastq'
                    if self.conf['preprocessing']['fastqc']['enable'] != False:
                        preprocessing_obj.input_file = self.preprocessing_output
                        os.mkdir(
                            os.path.abspath('.') + '/' + self.result_dir +
                            '/' + 'preprocessing/fastqc_after_filtering')
                        preprocessing_obj.fastqc_single_end(
                            'fastqc_after_filtering')
                elif self.conf['preprocessing']['SOAPnuke']['enable'] != False:
                    preprocessing_obj.soapnuke_single_end()
                    self.preprocessing_output = self.result_dir + '/preprocessing/SOAPnuke_output.fastq'
                    if self.conf['preprocessing']['fastqc']['enable'] != False:
                        preprocessing_obj.input_file = self.preprocessing_output
                        os.mkdir(
                            os.path.abspath('.') + '/' + self.result_dir +
                            '/' + 'preprocessing/fastqc_after_filtering')
                        preprocessing_obj.fastqc_single_end(
                            'fastqc_after_filtering')
                else:
                    self.preprocessing_output = self.input_file
            temp_file = open(
                os.path.abspath('.') + '/' + self.result_dir +
                '/Summary_of_results.html', 'a+')
            temp_file.write('<ul>\n')
            temp_file.write('<li>preprocessing result is in %s</li>\n' %
                            './preprocessing')
            temp_file.write('<li><a href="%s">click to report</a></li>\n' %
                            ('./preprocessing/'))
            temp_file.write('</ul>\n')
            temp_file.close()
        else:
            self.preprocessing_output = self.input_file

        if self.conf['identification']['enable'] != False:
            os.mkdir(
                os.path.abspath('.') + '/' + self.result_dir + '/' +
                'identification')
            resequencing_obj = annotation.Resequencing(
                self.result_dir, self.conf, self.preprocessing_output)
            resequencing_obj.run()
Example 26
def get_turkish_dict_defs(dict_path):
    """
    This function extracts the dictionary definitions of words in Turkish.

    :param dict_path: The path to the Turkish dictionary that is in parsed and disambiguated form.
    :type dict_path: str
    :return: A dict containing words and their dictionary definitions.
    :rtype: dict
    """

    pre = preprocessing.Preprocessing()

    all_mwes = pre.get_turkish_idioms()

    dict_definitions = {}

    for filename in os.listdir(dict_path):
        file = os.path.join(dict_path, filename)
        with open(file, "r") as f:
            meaning = False
            dict_entry_word = ""
            for line in f:
                line = line.lower()
                if line[0] == "[" or line[0] == "]":
                    continue
                if WORD_SEP in line:
                    # The below if block performs some preprocessing operations.
                    if dict_entry_word in dict_definitions:
                        # The below line of code could have been optimised.
                        dict_def = " ".join(
                            list(dict_definitions[dict_entry_word]))
                        upd_def = pre.replace_mwe(dict_def, all_mwes)
                        upd_def = pre.capture_and_update_consec_negs(upd_def)
                        dict_definitions[dict_entry_word] = set(upd_def)
                    dict_entry_word = ""
                    meaning = False
                    continue

                if MEANING_MARK in line:
                    meaning = True
                    if dict_entry_word[-1] == '_':
                        dict_entry_word = dict_entry_word[:-1]
                    dict_entry_word = remove_noise_char(dict_entry_word)
                    if dict_entry_word not in dict_definitions:
                        dict_definitions[dict_entry_word] = set([])
                        # dict_definitions[dict_entry_word].add(dict_entry_word)
                    continue

                if not meaning and "[" in line:
                    if "lH[Adj+With]" in line.split()[1]:
                        dict_entry_word += line.split()[0]
                    else:
                        dict_entry_word += line.split("[")[0].split()[1]

                elif meaning and "[" in line:
                    meaning_word = line.split("[")[0].split()[1]
                    meaning_word = remove_noise_char(meaning_word)
                    turk_dict_word_freqs[meaning_word] += 1
                    dict_definitions[dict_entry_word].add(meaning_word)

    dict_definitions = lexical_interface.get_dict_except_most_and_least_freq(
        turk_dict_word_freqs, dict_definitions)
    return dict_definitions
Example 27
train_char_data = ld.load_train_char_data()
print ('train_char_data.shape: ' + str(train_char_data.shape))

train_data_class_annotation, unique_edit_trees_from_train_data = ld.load_class_annotation_and_unique_edit_trees_from_train_data()
print ('train_data_class_annotation: ' + str(train_data_class_annotation.shape))
print ('size of unique_edit_trees_from_train_data: ' + str(len(unique_edit_trees_from_train_data)))

train_data_applicable_trees = ld.load_applicable_trees_data(train_file)
print ('train_data_applicable_trees: ' + str(train_data_applicable_trees.shape))

train_data_global_vectors = ld.load_global_word_vectors(train_file, global_vector_file)
print ('train_data_global_vectors.shape: ' + str(train_data_global_vectors.shape))

nb_tree_classes = len(unique_edit_trees_from_train_data)
word_vector_size = int(train_data_global_vectors.shape[1])
embedded_char_vector_length = len(preprocessing.Preprocessing(train_file).get_char_dic())

#***************************** Training Phase Start *****************************#

#********************** Building Char Model *************************#

# no_of_words = int(train_char_data.shape[0])
char_model = models.get_char_model(char_network_cell, max_word_length, embedded_char_vector_length, char_feature_output)
print('char model summary:')
print(char_model.summary())

char_model_file_name = train_file + '_char_level_' + char_network_cell + '.h5'
if os.path.isfile(os.path.join('./', char_model_file_name)):
    char_model.load_weights(os.path.join('./', char_model_file_name))
    print ('loaded ' + char_model_file_name + ' from disk')
else:
Example 28
import preprocessing as pp
import classifier as cc
import visualizer as vs
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

cols = [i for i in range(0, 32)]
df = pd.read_csv('breast_cancer.csv', index_col=32)
df = df.drop('id', axis=1)
df = pp.Preprocessing().handleMissing(df)
X = df.drop('diagnosis', axis=1)
X = pp.Preprocessing().scale(X.values, type=pp.STANDARD_SCALER)
y = df['diagnosis']
y = pp.Preprocessing().encode(y.values, type=pp.LABEL_ENCODER)

X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis',
                                                            axis=1),
                                                    y,
                                                    random_state=0,
                                                    test_size=0.33)

# Random Forest
params = dict(n_estimators=95, random_state=0, criterion='gini')
classifier = cc.Classifier(type=cc.RANDOM_FOREST, **params)
classifier.fit(X_train, y_train)
print("******************Random Forest******************")
print(classifier.score(X_test, y_test))
print("*************************************************")