def add_doc(docs, doc_path, category_words, term_freq_db, doc_freq_db):
    '''
    Adds a document to the database.
    Inputs : documents in database, path of document, set of words in the
             document's category, term_freq_db dictionary, doc_freq_db dictionary.
    term_freq_db is a dictionary of dictionaries which stores the frequency
    of each word in every document.
    doc_freq_db is a dictionary that stores, for each word, the number of
    documents in which the word occurs.
    Returns : the id assigned to the added document.
    '''
    curr_id = len(docs)
    docs.append(curr_id)
    # pre_process returns the list of tokens in the document
    curr_words = pre_process(doc_path)
    doc = give_name(doc_path)
    term_freq_db[doc] = dict()
    for word in curr_words:
        category_words.add(word)
        if word in term_freq_db[doc]:
            term_freq_db[doc][word] += 1
        else:
            term_freq_db[doc][word] = 1
    # Each document contributes at most once to a word's document frequency
    for word in set(curr_words):
        if word in doc_freq_db:
            doc_freq_db[word] += 1
        else:
            doc_freq_db[word] = 1
    print(f'Added document {curr_id} to database')
    return curr_id
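# Shapes of the structures that add_doc maintains (an illustrative sketch;
# pre_process and give_name come from elsewhere in this project, and the
# path below is hypothetical):
#
#   docs, category_words = [], set()
#   term_freq_db, doc_freq_db = dict(), dict()
#   add_doc(docs, 'corpus/sample.txt', category_words, term_freq_db, doc_freq_db)
#   # term_freq_db -> {'sample': {'word': 3, ...}}   word counts per document
#   # doc_freq_db  -> {'word': 1, ...}               documents containing each word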
def give_docs(path, category_words_docs, threshold):
    '''
    Prints the probable categories of a document and collects the documents
    of those categories.
    Inputs : path of document, category_words_docs, threshold
             (minimum intersection ratio).
    category_words_docs is a dictionary that stores, for every category in
    the database, the set of words in that category and the list of
    documents of that category.
    Returns : list docs of (document list, category) pairs, and
              word_freq_doc, the word-to-frequency dictionary of the document.
    '''
    processed_doc = pre_process(path)
    word_freq_doc = generate_word_freq(processed_doc)
    words = set(word_freq_doc)
    docs = []
    categories = []
    for category in category_words_docs:
        intersection_ratio = len(
            words.intersection(category_words_docs[category][0])) / len(words)
        print(f'Intersection ratio with {category} is {intersection_ratio}')
        if intersection_ratio >= threshold:
            categories.append(category)
            docs.append((category_words_docs[category][1], category))
    print(f'Document belongs to {categories} category')
    return docs, word_freq_doc
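# Layout that give_docs expects for category_words_docs, inferred from the
# indexing above ([0] is the category's word set, [1] its document list;
# the category names and path here are hypothetical):
#
#   category_words_docs = {
#       'physics': ({'quark', 'boson'}, [0, 4, 7]),
#       'history': ({'empire', 'treaty'}, [1, 2]),
#   }
#   docs, word_freq_doc = give_docs('test_doc.txt', category_words_docs, 0.4)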
import operator


def scan_plagiarism(path, category_words_docs, id_name, threshold):
    '''
    Inputs : path of test document, category_words_docs dictionary,
             id_name dictionary and threshold for category recognition.
    Returns : the top ten database documents from which plagiarism may have
              occurred, with the corresponding similarity scores.
    Note : term_freq_db, doc_freq_db and tf_idf_db are module-level
           databases; 2318 is the number of documents in the corpus.
    '''
    docs, word_freq_doc = give_docs(path, category_words_docs, threshold)
    doc = give_name(path)
    words_list = pre_process(path)
    # Calculating the term frequency of the test document
    term_freq = dict()
    for word in words_list:
        if word in term_freq:
            term_freq[word] += 1
        else:
            term_freq[word] = 1
    print("PLAGIARISM SCORES \n")
    d = calc_score(docs, term_freq, term_freq_db, doc_freq_db, tf_idf_db, 2318)
    d = dict(sorted(d.items(), key=operator.itemgetter(1), reverse=True))
    sim_score = dict(list(d.items())[0:10])
    for db_doc in sim_score:
        print(db_doc, sim_score[db_doc])
    print("\n")
    return sim_score
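# calc_score is defined elsewhere in this project. Purely as a hedged
# illustration of one common similarity measure over this kind of tf/df
# data (not necessarily the project's exact formula), here is a cosine
# similarity between tf-idf vectors:
from math import log, sqrt


def cosine_tf_idf(tf_a, tf_b, doc_freq_db, total_docs):
    """Cosine similarity between two word->count dicts under tf-idf weights."""
    def weight(tf, w):
        # tf-idf weight: term frequency times log inverse document frequency
        return tf[w] * log(total_docs / doc_freq_db[w])

    common = [w for w in tf_a if w in tf_b and w in doc_freq_db]
    dot = sum(weight(tf_a, w) * weight(tf_b, w) for w in common)
    norm_a = sqrt(sum(weight(tf_a, w) ** 2 for w in tf_a if w in doc_freq_db))
    norm_b = sqrt(sum(weight(tf_b, w) ** 2 for w in tf_b if w in doc_freq_db))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0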
def get_preprocessed_texts(df):
    """
    Returns a series with the pre-processed documents.

    :param df: pandas.DataFrame containing the texts in the "text" column.
    :return: pandas.Series with the pre-processed documents.
    """
    return df.text.copy().map(pre_process)
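# Minimal usage sketch (assumes this project's pre_process accepts a raw
# string; the example rows are hypothetical):
#
#   import pandas as pd
#   df = pd.DataFrame({'text': ['First document.', 'Second document.']})
#   processed = get_preprocessed_texts(df)  # pandas.Series of processed texts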
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


def apply_lda(a, X_train, X_test, y_train, y_test):
    image_amount = len(X_test)
    feature_amount = len(X_test[0])
    print("images {}, features {}".format(image_amount, feature_amount))
    #X_train, X_test, y_train, y_test = pre_process(X, y)
    #SVM_recommend_run("PCA", X_train, X_test, y_train, y_test, {'pca': a})
    SVM_paras = {'C': 0.01, 'max_iter': 2000}
    clf = LinearSVC(**SVM_paras)
    print("start training")
    clf.fit(X_train, y_train)
    print("start testing")
    y_pred = clf.predict(X_test)
    sc = clf.score(X_test, y_test)
    f1_sc = f1_score(y_test, y_pred, average='macro')
    print("score is {}, f1_score is {}".format(sc, f1_sc))
    # Append the component count and both scores to the result log
    with open("lda_result.txt", 'a') as f:
        f.write("{} {} {}".format(a, sc, f1_sc))
        f.write('\n')


if __name__ == "__main__":
    X, y = load_data()
    X_train, X_test, y_train, y_test = pre_process(X, y)
    # Sweep the number of LDA components
    for a in range(2, 128, 2):
        apply_lda(a, X_train, X_test, y_train, y_test)
    #apply_lda(1024, X, y)
    #apply_lda(512, X, y)
    #apply_lda(256, X, y)
    #apply_lda(128, X, y)
    #apply_lda(8, X, y)
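# The LDA reduction step itself is not shown in apply_lda above; the sketch
# below shows one way it could be done with scikit-learn (an assumption,
# not this script's confirmed implementation). Note that
# LinearDiscriminantAnalysis caps n_components at n_classes - 1.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def reduce_with_lda(a, X_train, X_test, y_train):
    """Project both splits onto the top `a` discriminant components."""
    lda = LinearDiscriminantAnalysis(n_components=a)
    X_train_red = lda.fit_transform(X_train, y_train)
    X_test_red = lda.transform(X_test)
    return X_train_red, X_test_red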
from pre_process import *
from numpy import log
import time
from numpy import sqrt
import matplotlib.pyplot as plt

t1 = time.time()
dssp2 = pre_process("./txtfile/dssp_info.txt", "./txtfile/dssp.txt")
stride = pre_process("./txtfile/stride_info.txt", "./txtfile/stride.txt")
fsr, fs, fr = fSR(dssp2)
print(fsr)
print(fs)
print(fr)
# total frequency across the three structure states (H, E, C)
total = fs['C'] + fs['E'] + fs['H']

# Read the protein sequences and structures into a 2-D list: each record
# holds [sequence, structure, header].
f = open("./txtfile/dssp_protein")
lines = f.readlines()
f.close()
list2 = []
for i in range(0, len(lines), 3):
    temp = []
    temp.append(lines[i + 1].strip())
    temp.append(lines[i + 2].strip())
    temp.append(lines[i].strip())
    list2.append(temp)
from pre_process import *
from numpy import log
import matplotlib.pyplot as plt

dssp = pre_process("./txtfile/dssp_info.txt", "./txtfile/dssp_protein.txt")
stride = pre_process("./txtfile/stride_info.txt", "./txtfile/stride_protein.txt")
fsr, fs, fr = fSR(dssp)
#print(fs)
#print(fr)
# total frequency across the three structure states (H, E, C)
total = fs['C'] + fs['E'] + fs['H']


# begin the GOR II algorithm
def gor2(dssp, resultfile):
    predict2 = []
    for i in range(len(dssp)):
        # Self-information term for helix at position i
        helix = log(fsr['H'][dssp[i][3]] /
                    (fsr['C'][dssp[i][3]] + fsr['E'][dssp[i][3]])) + log(
                        (fs['E'] + fs['C']) / fs['H'])
        # Add directional information from a window of 8 residues on each
        # side of position i (the GOR II extension)
        for j in range(-8, 9):
            if j != 0:
                if i + j > 0 and i + j < len(dssp) - 1:
                    t = i + j
                    # only count neighbours belonging to the same protein record
                    if dssp[t][0] == dssp[i][0] and dssp[t][1] == dssp[i][1]:
                        helix += log(fsr['H'][dssp[t][3]] /
                                     (fsr['C'][dssp[t][3]] +
                                      fsr['E'][dssp[t][3]]))
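# gor2 accumulates a log-odds score like `helix` for each structure class;
# how those scores become a prediction falls outside the excerpt above. A
# hedged sketch of the usual decision rule (argmax over the three classes,
# an illustration rather than gor2's confirmed remainder):
def pick_state(helix_score, sheet_score, coil_score):
    """Return 'H', 'E' or 'C' for whichever of the three GOR scores is largest."""
    scores = {'H': helix_score, 'E': sheet_score, 'C': coil_score}
    return max(scores, key=scores.get)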
print(len(train_x))

# Test Data
for file in test_dir:
    mid_file = MidiFile('python_test_set/' + file)
    file_name = mid_file.filename.split('/')[-1]
    vector = []
    for track in mid_file.tracks:
        for msg in track:
            # Keep only note-on events (velocity 0 marks a note-off)
            if hasattr(msg, 'note'):
                if msg.velocity != 0:
                    vector.append(msg.note)
    pre_pro_vector = pre_process(vector, model)
    test_x[file_name] = pre_pro_vector
    #print(test_x[file_name])

print(len(test_x))

# Creating Classifier
classifier = svm.SVC()

# Training Our Model
# Before training we must reorder the data, since it is not yet in the
# format svm.SVC expects.
train_x_data = []
train_y_data = []
train_x_keys = list(train_x.keys())
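# svm.SVC needs X and y as two aligned sequences, while the features above
# live in a dict keyed by file name. A small sketch of the reordering step
# the comment above alludes to (train_y here is an assumed companion dict
# of labels keyed by the same file names, not shown in this excerpt):
def align_data(feature_dict, label_dict):
    """Return (X, y) lists aligned on the dicts' shared file-name keys."""
    keys = list(feature_dict.keys())
    return [feature_dict[k] for k in keys], [label_dict[k] for k in keys]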