def rake(filePath):
    """Run the full RAKE keyword-extraction pipeline on one document.

    Args:
        filePath (str): Path of the file to read.

    Returns:
        list: Index keywords extracted from the document.
    """
    raw_text = readFile.readFile(filePath)
    preprocessor = preprocessing.Preprocess()
    candidate_keywords = preprocessor.preprocess(raw_text)
    return postprocessing.postprocess(candidate_keywords)
import preprocessing

from A1 import model_svm_a1 as svm_a1
from A2 import model_svm_a2 as svm_a2
from B1 import model_cnn_b1 as cnn_b1
from B2 import model_cnn_b2 as cnn_b2

# ======================================================================
# Data preprocessing
# ======================================================================
pre = preprocessing.Preprocess()

# CelebA dataset: 68-landmark feature extraction for the gender and
# emotion tasks (False apparently selects the train/validation split,
# True the extra test split -- see the calls below).
(img_train_gender, label_train_gender,
 img_train_emo, label_train_emo,
 img_val_gender, label_val_gender,
 img_val_emo, label_val_emo) = pre.preprocess_celeba(False)

# Cartoon dataset: image preprocessing, train/validation split.
(train_imgs_cartoon, train_labels_cartoon,
 val_imgs_cartoon, val_labels_cartoon) = pre.preprocess_cartoon(False)

# Additional held-out test data.
(img_test_gender, label_test_gender,
 img_test_emo, label_test_emo) = pre.preprocess_celeba(True)
test_imgs_cartoon, test_labels_cartoon = pre.preprocess_cartoon(True)

# ======================================================================
# Task A1: gender classification with an SVM
# ======================================================================
model_A1 = svm_a1.Utils_A1()
acc_A1_train, clf_gender = model_A1.train(img_train_gender, label_train_gender)
acc_A1_val = model_A1.test(clf_gender, img_val_gender, label_val_gender)

# Additional test dataset
acc_A1_test = model_A1.test(clf_gender, img_test_gender, label_test_gender)
data.to_csv('variants_encoded_only_VUS.csv', index=False) return data #Write csv from pandas dataframe without encoding def write_not_encoded(data): # data.to_csv('variants_not_encoded_only_VUS.csv', index=False) data.to_csv('variants_not_encoded.csv', index=False) #Data Insertion pp = preprocessing.Preprocess( data_path_c='dict2csv/variants.csv', read_dtype={ 'motif.ehipos': object, 'motif.ename': object, 'cadd.istv': object }, autoEliminateNullColumns=False, autoImpute=False) #https://stackoverflow.com/a/27232309/8149411 data = pp.getData() data = data.astype({ 'motif.ehipos': np.bool, 'motif.ename': np.bool, 'cadd.istv': str }) #Drop id and url columns pp.dropCols( ['_id', 'cadd._license', 'clinvar._license', 'clinvar.rsid', '_score']) data = pp.getData()
import os

import preprocessing
# Konlpy tokenizers
from konlpy.tag import Okt, Komoran, Kkma

okt = Okt()
komoran = Komoran()
kkma = Kkma()

# e.g. 'keyValue=12341234' -- replace with your assigned NDSL API key.
# NOTE(review): ``NDSL`` and ``etc`` are not imported in this chunk --
# presumably they come from imports above/outside this view; confirm.
ndsl = NDSL('keyValue=12341234')

# e.g. input: '드론 증강현실 드론 자율주행'
#   -> query = ['드론', '증강현실', '드론', '자율주행']
query = input().split()
load = 'patent_'

# Fetch patents for the query terms.
ndsl.getPatent(query, load)
# Sort / normalise the fetched patents.
etc.sortPatent([query[0]], load, 'temp')

# Collect only the abstracts.
# FIX: the original used ``try: os.mkdir(...) except: pass`` -- a bare
# except that silently swallows *every* error, not just "already exists".
# ``os.makedirs(..., exist_ok=True)`` is the idiomatic create-if-missing
# form and surfaces real failures (e.g. permission errors).
os.makedirs('abstract', exist_ok=True)
etc.getAb(query[0], load, 'abstract/patent_')

# Tokenise the abstracts with the Okt tokenizer.
pre = preprocessing.Preprocess(okt)
pre.getToken([query[0]], 'abstract/patent_', 'abstract/patent_')
# -*- coding: utf-8 -*-
"""Fit a logistic-regression gender classifier on the 'veriler.csv' dataset."""
import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load the CSV, drop the country column ('ulke') and the first five rows,
# then scale the first three feature columns.
pp = preprocessing.Preprocess('veriler.csv')
pp.dropCols(['ulke'])
pp.dropRows([0, 1, 2, 3, 4])
pp.scale([0, 1, 2])
# pp.print()

# 'cinsiyet' (gender) is the target column.
x_train, x_test, y_train, y_test = pp.trainTestSplitting(['cinsiyet'])

classifier = LogisticRegression(solver='lbfgs', random_state=0)
classifier.fit(x_train, y_train.values.ravel())

predictions = classifier.predict(x_test)
print(predictions)
print(y_test.values.ravel())

cm = confusion_matrix(y_test, predictions)
print(cm)
# -*- coding: utf-8 -*-
"""Compare scikit-learn naive-Bayes variants on the same train/test split."""
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.metrics import confusion_matrix
import preprocessing

pp = preprocessing.Preprocess('src/veriler.csv')
pp.dropCols(['ulke'])
#pp.encode()
data = pp.getData()
x_train, x_test, y_train, y_test = pp.trainTestSplitting(['cinsiyet'])
y_train = y_train.values.ravel()

# GaussianNB: appropriate when the features are continuously distributed.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred_gnb = gnb.predict(x_test)
cm_gnb = confusion_matrix(y_test, y_pred_gnb)
print('GaussianNB:\n', cm_gnb)

# MultinomialNB: appropriate for nominal / discrete-count features.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred_mnb = mnb.predict(x_test)
cm_mnb = confusion_matrix(y_test, y_pred_mnb)
# FIX: the label was misspelled 'MultiominalNB' in the original output.
print('MultinomialNB:\n', cm_mnb)

# ComplementNB: for nominal targets that are imbalanced; in text
# classification it typically outperforms MultinomialNB.
compnb = ComplementNB()
compnb.fit(x_train, y_train)
y_pred_compnb = compnb.predict(x_test)
cm_compnb = confusion_matrix(y_test, y_pred_compnb)
print('ComplementNB:\n', cm_compnb)