def __init__(self, num_of_homo_feats=10, max_qry_length=1794, max_doc_length=2907,
             query_path=None, document_path=None, corpus="TDT2"):
    res_pos = True     # preserve word positions during preprocessing
    str2int = True     # convert word IDs from strings to integers
    self.num_vocab = 51253
    self.max_qry_length = max_qry_length
    self.max_doc_length = max_doc_length
    self.num_of_homo_feats = num_of_homo_feats
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # read documents, preserving word positions
    doc = ProcDoc.read_file(document_path)
    self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)
    # read queries, preserving word positions
    qry = ProcDoc.read_file(query_path)
    self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)
    # HMM training set (query-document relevance judgments)
    self.hmm_training_set = ProcDoc.read_relevance_dict()
    self.homo_feats = self.__genFeature(num_of_homo_feats)
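# A minimal usage sketch for the constructor above, kept as a comment since the
# class body continues below. The enclosing class name (DataReader) is a
# placeholder -- this snippet only shows __init__:
#
#   reader = DataReader(num_of_homo_feats=10, corpus="TDT2")
#   print len(reader.doc), len(reader.qry)   # preprocessed documents and queries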
from collections import defaultdict

import ProcDoc
import PLSA


def run():
    INIT_PROBABILITY = 1.0 / 60
    topic_word_prob_dict = ProcDoc.read_clusters()    # P(W|T), {T: {W: prob}}
    doc_topic_prob_dict = defaultdict(dict)           # P(T|D), {D: {T: prob}}
    doc_word_topic_prob_dict = defaultdict(dict)      # P(T|w,D), {D: {W: {T: prob}}}
    doc_wc_dict = ProcDoc.read_doc_dict()             # {doc name: doc content}
    doc_wc_dict = ProcDoc.doc_preprocess(doc_wc_dict)

    # convert {doc name: doc content} to {doc name: {word: count}}
    for docName, content in doc_wc_dict.items():
        doc_wc_dict[docName] = ProcDoc.word_count(content, {})

    # initialize P(T|D) uniformly over all topics
    print "Initialize P(T|D)"
    for docName, wordCount in doc_wc_dict.items():
        for topic, wordProb in topic_word_prob_dict.items():
            doc_topic_prob_dict[docName][topic] = INIT_PROBABILITY
    '''
    print "Initialize P(T| w, D)"
    for docName, wordCount in doc_wc_dict.items():
        word_list = {}
        for word, frequency in wordCount.items():
            topic_prob = {}
            for topic, wordProb in topic_word_prob_dict.items():
                topic_prob[topic] = 0.0
            word_list[word] = topic_prob
        doc_word_topic_prob_dict[docName] = word_list
    '''
    print "start PLSA"
    [topic_word_prob_dict, doc_topic_prob_dict] = PLSA.Probability_LSA(
        doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict, doc_word_topic_prob_dict)
    print "end PLSA"

    # fold the topics back into one word distribution per document:
    # P(W|D) = sum over T of P(W|T) * P(T|D)
    p_plsa = {}    # {D: {W: prob}}
    for doc, topic_prob_list in doc_topic_prob_dict.items():
        p_plsa_word = {}
        for topic, doc_prob in topic_prob_list.items():
            for word, word_prob in topic_word_prob_dict[topic].items():
                if word in p_plsa_word:
                    p_plsa_word[word] += word_prob * doc_prob
                else:
                    p_plsa_word[word] = word_prob * doc_prob
        p_plsa[doc] = p_plsa_word
    return p_plsa
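# A tiny self-contained check of the fold-in step above, with made-up numbers
# (the dictionaries mirror the shapes used in run(); the values are illustrative):
if __name__ == "__main__":
    topic_word = {"t1": {"w": 0.5}, "t2": {"w": 0.1}}   # P(W|T)
    doc_topic = {"d1": {"t1": 0.8, "t2": 0.2}}          # P(T|D)
    p_w_d = sum(topic_word[t]["w"] * p for t, p in doc_topic["d1"].items())
    print p_w_d    # 0.5 * 0.8 + 0.1 * 0.2 = 0.42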
import os

import numpy as np

import ProcDoc

data = {}                  # document contents {doc name: content}
background_model = {}      # background word counts over the 2265 documents {word: count}
general_model = {}
query = {}                 # query contents {query name: content}
vocabulary = np.zeros(51253)

#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"

# read and preprocess documents
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)

# HMM training set (query-document relevance judgments)
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}

# read queries and build a word count per query
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}
for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})
query_unigram = ProcDoc.unigram(query_wordcount)
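# For reference, a maximum-likelihood unigram model of the kind ProcDoc.unigram
# presumably returns: P(w|Q) = count(w, Q) / |Q|. This helper is a hedged sketch,
# not ProcDoc's actual implementation.
def unigram_sketch(wordcount_dict):
    unigram = {}
    for name, wordcount in wordcount_dict.items():
        total = float(sum(wordcount.values()))
        unigram[name] = dict((w, c / total) for w, c in wordcount.items())
    return unigram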
import cPickle as Pickle    # assumption: "Pickle" below is cPickle under this alias
import numpy as np

import ProcDoc
import word2vec_model

# model_path, document_path and query_path are defined earlier in this file
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_list = Pickle.load(f)
with open(model_path + "query_list.pkl", "rb") as f:
    qry_list = Pickle.load(f)
with open(model_path + "test_query_list.pkl", "rb") as f:
    tstQry_list = Pickle.load(f)

wordModel = word2vec_model.word2vec_model()
wordVec = wordModel.getWord2Vec()
vocab_length = wordModel.vocabulary_length
print vocab_length

# documents
doc = ProcDoc.read_file(document_path)
doc = ProcDoc.doc_preprocess(doc)
#[docTmpList, docEmbList] = content2Emb(doc, wordVec, 100)
#doc_emb = rePermute(docTmpList, docEmbList, doc_list)
#doc_emb = content2List(doc, doc_list)
#doc_emb = np.asarray(doc_emb)
#print doc_emb.shape
#np.save(model_path + "doc_id_fix_pad.npy", doc_emb)

# training queries
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
#[qryTmpList, qryEmbList] = content2Emb(query, wordVec, 100)
#qry_emb = rePermute(qryTmpList, qryEmbList, qry_list)
qry_emb = content2List(query, qry_list)
qry_emb = np.asarray(qry_emb)
print qry_emb.shape
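# content2List is called above but not defined in this snippet. A plausible
# sketch, assuming it maps each name in the pickled list to that item's word-ID
# sequence, zero-padded/truncated to a fixed length (the name, parameters, and
# details here are guesses, not the original implementation):
def content2List_sketch(content_dict, name_list, fix_length=100):
    result = []
    for name in name_list:
        ids = [int(t) for t in content_dict[name].split()]
        # truncate to fix_length, then pad with zeros up to fix_length
        ids = ids[:fix_length] + [0] * max(0, fix_length - len(ids))
        result.append(ids)
    return result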
# -*- coding: utf-8 -*-
import ProcDoc
from gensim import corpora, models, matutils
from sklearn.cluster import KMeans

documents = ProcDoc.read_doc()
documents = ProcDoc.doc_preprocess(documents)

# tokenize each document (the corpus is word-ID encoded, so a plain split suffices)
texts = [document.lower().split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# weight the bag-of-words corpus by TF-IDF and convert it to a sparse matrix
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print "TFIDF:"
corpus_tfidf = matutils.corpus2csc(corpus_tfidf).transpose()
print corpus_tfidf
print "__________________________________________"

# cluster the documents into 64 groups with k-means
num_of_clusters = 64
kmeans = KMeans(n_clusters=num_of_clusters)
doc_cluster = kmeans.fit_predict(corpus_tfidf)

# invert the assignment: clusters[c] holds the indices of the documents in cluster c
clusters = [[] for i in range(num_of_clusters)]
for doc_index, cluster in enumerate(doc_cluster):
    clusters[cluster].append(doc_index)
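# The cluster assignments can be turned into topic-word distributions P(W|T),
# the shape the PLSA script reads via ProcDoc.read_clusters(). A hedged sketch
# (unsmoothed MLE, illustrative only): pool each cluster's word counts and
# normalize; "texts" is reused from above.
topic_word_prob = {}
for topic, doc_indices in enumerate(clusters):
    counts = {}
    for i in doc_indices:
        for token in texts[i]:
            counts[token] = counts.get(token, 0) + 1
    total = float(sum(counts.values()))
    if total > 0:
        topic_word_prob[topic] = dict((w, c / total) for w, c in counts.items())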
# check whether the file exists before reading it
if os.path.isfile(query_item_path):
    with open(query_item_path, 'r') as f:
        # read the content of the query document (doc, content)
        query[query_item] = f.read()

# accumulate background word counts over all documents and queries
for key, value in data.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
for key, value in query.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
background_word_sum = ProcDoc.word_sum(background_word)

# document preprocessing
data = ProcDoc.doc_preprocess(data)

# query preprocessing
query = ProcDoc.query_preprocess(query)
query_word_count = {}
for q, q_content in query.items():
    query_word_count[q] = ProcDoc.word_count(q_content, {})

# evaluation: sweep the interpolation weight lambda over [0, 1)
assessment = readAssessment.get_assessment()
lambda_test = {0: 0}    # lambda values that have already been evaluated
interval = 0.1
isBreak = "run"
while isBreak != "exit":
    for my_lambda in np.arange(0, 1, interval):
        if my_lambda in lambda_test:
            continue
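# The sweep above tunes a Jelinek-Mercer-style interpolation between a document
# language model and the background model; per word, the smoothed probability
# being tuned is, in effect (a sketch of the formula, not code from this file):
#
#   P(w|D) = my_lambda * P_ml(w|D) + (1 - my_lambda) * background_word[w] / background_word_sum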
        # read the content of the query document (doc, content)
        query[query_item] = f.read()

# accumulate background word counts over all documents and queries
for key, value in data.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
for key, value in query.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
background_word_sum = ProcDoc.word_sum(background_word)
#background_word = sorted(background_word.items(), key=lambda x: x[1], reverse=True)
#print(background_word)

# document preprocessing (also returns per-document term frequencies)
[data_tf, data] = ProcDoc.doc_preprocess(data)

# query preprocessing
query = ProcDoc.query_preprocess(query)
query_word_count = {}
for q, q_content in query.items():
    query_word_count[q] = ProcDoc.word_count(q_content, {})
feedback_model = []

# evaluation: sweep the interpolation weight lambda over [0, 1)
assessment = readAssessment.get_assessment()
lambda_test = {0: 0}    # lambda values that have already been evaluated
interval = 0.1
isBreak = "run"
while isBreak != "exit":
    isWrite = True