Code example #1
def add_doc(docs, doc_path, category_words, term_freq_db, doc_freq_db):
	'''
	Add a document to the database.

	Inputs : list of documents in the database, path of the document, set of words
	in the document's category, term_freq_db dictionary, doc_freq_db dictionary

	term_freq_db is a dictionary of dictionaries which stores the frequency of each word in every document.
	doc_freq_db is a dictionary which stores, for each word, the number of documents in which it occurs.

	Returns the id assigned to the added document.
	'''
	curr_id = len(docs)
	docs.append(curr_id)

	curr_word_freq = pre_process(doc_path)  # List of tokens in the document
	doc = give_name(doc_path)
	term_freq_db[doc] = dict()
	for word in curr_word_freq :
		category_words.add(word)
		if word in term_freq_db[doc] :
			term_freq_db[doc][word] += 1
		else :
			term_freq_db[doc][word] = 1

	for word in set(curr_word_freq) :
		if word in doc_freq_db :
			doc_freq_db[word] += 1
		else :
			doc_freq_db[word] = 1
	
	print(f'Added document {curr_id} to database')
	return curr_id
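
A minimal usage sketch for add_doc (hypothetical: the container layout follows the docstring above, and the corpus path is made up):

# Hypothetical driver for add_doc; all names below are illustrative.
docs = []                 # list of document ids
category_words = set()    # all words seen so far in this category
term_freq_db = dict()     # doc name -> {word: frequency}
doc_freq_db = dict()      # word -> number of documents containing the word

doc_id = add_doc(docs, 'corpus/sports/article1.txt', category_words,
                 term_freq_db, doc_freq_db)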
Code example #2
def give_docs(path, category_words_docs, threshold):
    '''
    Gives the probable categories of a document, given the path of that document.

    Inputs : path of the document, category_words_docs dictionary, threshold (minimum intersection ratio)

    word_freq_doc maps each word of the document to its frequency.
    category_words_docs is a dictionary that stores, for every category in the
    database, the set of words in that category and the list of documents of
    that category.

    Prints the probable categories of the document.

    Returns : list docs of (document list, category) pairs, and word_freq_doc
    '''
    processed_doc = pre_process(path)
    word_freq_doc = generate_word_freq(processed_doc)
    words = set(word_freq_doc)
    docs = []
    categories = []
    for category in category_words_docs:
        intersection_ratio = len(
            words.intersection(category_words_docs[category][0])) / len(words)
        print(f'Intersection ratio with {category} is {intersection_ratio}')
        if intersection_ratio >= threshold:
            categories.append(category)
            docs.append((category_words_docs[category][1], category))
    print(f'Document belongs to {categories} category')
    return docs, word_freq_doc
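
A hedged sketch of the category_words_docs layout, inferred from the docstring and the [0]/[1] indexing above (category names and contents are made up):

category_words_docs = {
    # category -> (set of words in the category, list of documents of that category)
    'sports':  ({'match', 'score', 'team'}, [0, 1]),
    'science': ({'atom', 'energy', 'theory'}, [2]),
}
docs, word_freq_doc = give_docs('test/sample.txt', category_words_docs, 0.3)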
Code example #3
def scan_plagiarism(path, category_words_docs, id_name, threshold):
    '''
    Inputs = path of the test document, category_words_docs dictionary, id_name dictionary, and threshold for category recognition
    Returns = top ten documents from which plagiarism may have occurred and the corresponding similarity scores
    '''
    docs, word_freq_doc = give_docs(path, category_words_docs, threshold)

    doc = give_name(path)
    words_list = pre_process(path)
    term_freq = dict()

    #Calculating term frequency of test document
    for word in words_list:
        if word in term_freq:
            term_freq[word] += 1
        else:
            term_freq[word] = 1

    print("PLAGIARISM SCORES \n")
    # term_freq_db, doc_freq_db and tf_idf_db are globals here; 2318 is presumably the corpus size
    d = calc_score(docs, term_freq, term_freq_db, doc_freq_db, tf_idf_db, 2318)
    d = dict(sorted(d.items(), key=operator.itemgetter(1), reverse=True))
    sim_score = dict(list(d.items())[:10])
    for db_doc in sim_score:
        print(db_doc, sim_score[db_doc])
        print("\n")

    return sim_score
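
calc_score itself is not shown in these snippets. Purely as an illustration of what such a scorer might compute, here is a TF-IDF cosine-similarity sketch; the parameter meanings are assumptions, not the author's implementation, and tf_idf_db is left unused:

from math import log, sqrt

def calc_score(docs, term_freq, term_freq_db, doc_freq_db, tf_idf_db, n_docs):
    # Illustrative sketch only: cosine similarity between the TF-IDF vector of
    # the test document and each candidate database document. Assumes the
    # document lists in docs hold keys of term_freq_db.
    test_vec = {w: f * log(n_docs / doc_freq_db.get(w, n_docs))
                for w, f in term_freq.items()}
    test_norm = sqrt(sum(v * v for v in test_vec.values()))
    scores = dict()
    for doc_list, _category in docs:
        for db_doc in doc_list:
            db_vec = {w: f * log(n_docs / doc_freq_db.get(w, 1))
                      for w, f in term_freq_db[db_doc].items()}
            dot = sum(test_vec.get(w, 0.0) * v for w, v in db_vec.items())
            norm = sqrt(sum(v * v for v in db_vec.values()))
            scores[db_doc] = dot / (test_norm * norm) if test_norm and norm else 0.0
    return scores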
Code example #4
def get_preprocessed_texts(df):
    """
    Returns a series with the pre-processed documents.

    :param df: pandas.DataFrame containing the texts in the "text" column.
    :return:   pandas.Series with the pre-processed documents.
    """
    return df.text.copy().map(pre_process)
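
A hypothetical call, assuming a DataFrame with a "text" column:

import pandas as pd

df = pd.DataFrame({'text': ['First document.', 'Second document.']})
processed = get_preprocessed_texts(df)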
Code example #5
    image_amount = len(X_test)
    feature_amount = len(X_test[0])
    print("images {}, features {}".format(image_amount, feature_amount))
    #X_train, X_test, y_train, y_test = pre_process(X, y)
    #SVM_recommend_run("PCA", X_train, X_test, y_train, y_test, {'pca':a})
    SVM_paras = {'C': 0.01, 'max_iter': 2000}
    clf = LinearSVC(**SVM_paras)
    print("start training")
    clf.fit(X_train, y_train)
    print("start testing")
    y_pred = clf.predict(X_test)
    sc = clf.score(X_test, y_test)
    f1_sc = f1_score(y_test, y_pred, average='macro')
    print("score is {}, f1_score is {}".format(sc, f1_sc))
    f = open("lda_result.txt", 'a')
    f.write("{} {} {}".format(a, sc, f1_sc))
    f.write('\n')
    f.close()


if __name__ == "__main__":
    X, y = load_data()
    X_train, X_test, y_train, y_test = pre_process(X, y)
    for a in range(2, 128, 2):
        apply_lda(a, X_train, X_test, y_train, y_test)
    #apply_lda(1024,X,y)
    #apply_lda(512,X,y)
    #apply_lda(256,X,y)
    #apply_lda(128,X,y)
    #apply_lda(8,X,y)
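
The opening of apply_lda is cut off in this snippet. If "lda" refers to scikit-learn's linear discriminant analysis, a plausible (assumed, not confirmed by the source) header producing the X_train/X_test used above could look like this; note that scikit-learn caps n_components at n_classes - 1:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC


def apply_lda(a, X_train, X_test, y_train, y_test):
    # Assumed sketch: reduce to `a` dimensions with LDA before the SVM steps above.
    lda = LinearDiscriminantAnalysis(n_components=a)
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)
    ...  # the body shown above continues from here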
Code example #6
File: gor3.py Project: handreazz/bioinformatics
from pre_process import *
from numpy import log
import time
from numpy import sqrt
import matplotlib.pyplot as plt

t1 = time.time()
dssp2 = pre_process("./txtfile/dssp_info.txt", "./txtfile/dssp.txt")
stride = pre_process("./txtfile/stride_info.txt", "./txtfile/stride.txt")

fsr, fs, fr = fSR(dssp2)

print(fsr)
print(fs)
print(fr)

total = fs['C'] + fs['E'] + fs['H']  # total observations over the three structure classes

# do some pre-processing: read the protein sequence and structure into a 2d list
with open("./txtfile/dssp_protein") as f:
    lines = f.readlines()

list2 = []
for i in range(0, len(lines), 3):
    temp = []
    temp.append(lines[i + 1].strip())
    temp.append(lines[i + 2].strip())
    temp.append(lines[i].strip())
    list2.append(temp)
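
fSR comes from pre_process and is not shown; the shapes below are inferred from how its results are indexed in these snippets (an assumption, not the actual definition):

# Assumed shapes, inferred from usage only:
# fsr : {structure: {residue: count}}   e.g. fsr['H']['A'], helix observed with alanine
# fs  : {structure: count}              e.g. fs['H'], total helix observations
# fr  : {residue: count}                e.g. fr['A'], total alanine observations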
Code example #7
from pre_process import *
from numpy import log
import matplotlib.pyplot as plt

dssp = pre_process("./txtfile/dssp_info.txt", "./txtfile/dssp_protein.txt")
stride = pre_process("./txtfile/stride_info.txt",
                     "./txtfile/stride_protein.txt")

fsr, fs, fr = fSR(dssp)
#print(fs)
#print(fr)
total = fs['C'] + fs['E'] + fs['H']  # total observations over the three structure classes

# begin the GOR algorithm (the function below implements GOR II)


def gor2(dssp, resultfile):
    predict2 = []
    for i in range(len(dssp)):

        # self-information of the central residue for the helix state
        helix = log(fsr['H'][dssp[i][3]] /
                    (fsr['C'][dssp[i][3]] + fsr['E'][dssp[i][3]])) + log(
                        (fs['E'] + fs['C']) / fs['H'])
        for j in range(-8, 9):
            if j != 0:
                if i + j > 0 and i + j < len(dssp) - 1:
                    # GOR II: add the contribution of each neighbouring residue
                    t = i + j
                    if dssp[t][0] == dssp[i][0] and dssp[t][1] == dssp[i][1]:
                        helix += log(fsr['H'][dssp[t][3]] /
                                     (fsr['C'][dssp[t][3]] +
                                      fsr['E'][dssp[t][3]]))
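
For reference, the quantity accumulated in helix above is the GOR information difference for the helix state H against the alternatives C and E (hedged, since only this fragment of gor2 is visible):

\[ I(\Delta H; R) = \log\frac{f_{H,R}}{f_{C,R} + f_{E,R}} + \log\frac{f_C + f_E}{f_H} \]

with the GOR II refinement adding the same log-ratio term for each neighbouring residue in a window of plus or minus 8 positions.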
Code example #8
print(len(train_x))

# Test Data

for file in test_dir:
    mid_file = MidiFile('python_test_set/' + file)
    file_name = mid_file.filename.split('/')[-1]

    vector = []

    for i, track in enumerate(mid_file.tracks):
        for msg in track:
            if hasattr(msg, 'note'):
                if msg.velocity != 0:
                    vector.append(msg.note)
    pre_pro_vector = pre_process(vector, model)
    test_x[file_name] = pre_pro_vector
    #print(test_x[file_name])

print(len(test_x))

# Creating Classifier
classifier = svm.SVC()

# Training Our Model
# Before training the model we need to fix the data, since it is not yet in the correct format and order

train_x_data = []
train_y_data = []

train_x_keys = list(train_x.keys())
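
A hedged sketch of how the reordering might continue (taking the class label from the file name prefix is an assumption, not the source's scheme):

for key in train_x_keys:
    train_x_data.append(train_x[key])
    train_y_data.append(key.split('_')[0])  # assumed: label encoded in the file name

classifier.fit(train_x_data, train_y_data)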