Code Example #1
    def __init__(self,
                 num_of_homo_feats=10,
                 max_qry_length=1794,
                 max_doc_length=2907,
                 query_path=None,
                 document_path=None,
                 corpus="TDT2"):
        res_pos = True
        str2int = True
        self.num_vocab = 51253
        self.max_qry_length = max_qry_length
        self.max_doc_length = max_doc_length
        self.num_of_homo_feats = num_of_homo_feats
        if query_path is None:
            query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if document_path is None:
            document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
        # read document, reserve position
        doc = ProcDoc.read_file(document_path)
        self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)

        # read query, reserve position
        qry = ProcDoc.read_file(query_path)
        self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)

        # HMMTrainingSet
        self.hmm_training_set = ProcDoc.read_relevance_dict()
        self.homo_feats = self.__genFeature(num_of_homo_feats)
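The class header is not part of this excerpt, so the owning class's name is unknown. A minimal usage sketch, assuming the constructor above belongs to a data-preparation class called DataPrep (a hypothetical name):

# Hypothetical usage; "DataPrep" stands in for the class name omitted from the excerpt.
reader = DataPrep(num_of_homo_feats=10, corpus="TDT2")
print(len(reader.qry))         # preprocessed queries, positions preserved
print(len(reader.doc))         # preprocessed documents
print(len(reader.homo_feats))  # homogeneous features built by __genFeature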
Code Example #2
File: Main.py Project: zuacubd/Information-retrieval
def run():
	INIT_PROBABILITY = 1.0 / 60
	topic_word_prob_dict = ProcDoc.read_clusters()									# read cluster P(W|T), {T: {W:Prob}}
	doc_topic_prob_dict = defaultdict(dict)														# P(T|D),{D:{T:Prob}} 
	doc_word_topic_prob_dict = defaultdict(dict)									# P(T| w, D), {D: {word:{T:prob}}}
	doc_wc_dict = ProcDoc.read_doc_dict()  											# read document (Doc No.,Doc content)  
	doc_wc_dict = ProcDoc.doc_preprocess(doc_wc_dict)
	# calculate word of the background
	# convert (Doc No.,Doc content) to (Doc_No, {word, count})
	for docName, content in doc_wc_dict.items():
		temp_dict = ProcDoc.word_count(content, {})
		doc_wc_dict[docName] = temp_dict

	# initialize P(T|D)
	print "Initialize P(T|D)"
	for docName, wordCount in doc_wc_dict.items():
		topic_prob = {}
		for topic, wordProb in topic_word_prob_dict.items():
			doc_topic_prob_dict[docName][topic] = INIT_PROBABILITY
			
	'''
	print "Initialize P(T| w, D)"
	for docName, wordCount in doc_wc_dict.items():	
		word_list = {}
		for word, frequency in wordCount.items():	
			topic_prob = {}
			for topic, wordProb in topic_word_prob_dict.items():
				topic_prob[topic] = 0.0
			word_list[word] = topic_prob
		doc_word_topic_prob_dict[docName] = word_list
	'''
	print "start PLSA"
	[topic_word_prob_dict, doc_topic_prob_dict] = PLSA.Probability_LSA(doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict, doc_word_topic_prob_dict)
	print "end PLSA"
	
	p_plsa = {}			# PLSA P(W|D) {D: {W : Prob}}
	for doc, topic_prob_list in doc_topic_prob_dict.items():
		p_plsa_word = {}
		for topic, doc_prob in topic_prob_list.items():
			for word, word_prob in topic_word_prob_dict[topic].items():
				# print word, word_prob	# debug output for every (word, probability) pair; disabled to keep run() usable
				if word in p_plsa_word:
					p_plsa_word[word] += word_prob * doc_prob
				else:
					p_plsa_word[word] = word_prob * doc_prob
			
		p_plsa[doc] = p_plsa_word

	return p_plsa
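The final loop in run() marginalizes the topics out of the PLSA model: for each document d, P(w|d) is accumulated as the sum over topics T of P(w|T) * P(T|d). A minimal standalone restatement of that step (the function name is illustrative, not from the repository):

def plsa_word_given_doc(doc_topic_prob, topic_word_prob):
    # P(w|d) = sum over T of P(w|T) * P(T|d), mirroring the loop above.
    p_plsa = {}
    for doc, topic_probs in doc_topic_prob.items():
        word_probs = {}
        for topic, p_t_d in topic_probs.items():
            for word, p_w_t in topic_word_prob[topic].items():
                word_probs[word] = word_probs.get(word, 0.0) + p_w_t * p_t_d
        p_plsa[doc] = word_probs
    return p_plsa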
Code Example #3
import os
import numpy as np
import ProcDoc

data = {}				# document contents (doc, content)
background_model = {}	# word counts over the 2265 documents (word, count)
general_model = {}
query = {}				# query
vocabulary = np.zeros(51253)

#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"	
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"


# read document
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)

# HMM training set (relevance dictionary)
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}

query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}

for q, q_content in query.items():
	query_wordcount[q] = ProcDoc.word_count(q_content, {})

query_unigram = ProcDoc.unigram(query_wordcount)
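ProcDoc.unigram presumably turns each query's word counts into a maximum-likelihood unigram distribution P(w|q); its implementation is not shown in this excerpt. A minimal sketch of that normalization, under that assumption:

def unigram(word_counts):
    # Assumed behavior: {query: {word: count}} -> {query: {word: count / total}}.
    dists = {}
    for qry, counts in word_counts.items():
        total = float(sum(counts.values()))
        dists[qry] = {w: c / total for w, c in counts.items()}
    return dists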

Code Example #4
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_list = Pickle.load(f)
with open(model_path + "query_list.pkl", "rb") as f:
    qry_list = Pickle.load(f)
with open(model_path + "test_query_list.pkl", "rb") as f:
    tstQry_list = Pickle.load(f)

wordModel = word2vec_model.word2vec_model()
wordVec = wordModel.getWord2Vec()
vocab_length = wordModel.vocabulary_length
print vocab_length

# document
doc = ProcDoc.read_file(document_path)
doc = ProcDoc.doc_preprocess(doc)
#[docTmpList, docEmbList] = content2Emb(doc, wordVec, 100)
#doc_emb = rePermute(docTmpList, docEmbList, doc_list)
#doc_emb = content2List(doc, doc_list)
#doc_emb = np.asarray(doc_emb)
#print doc_emb.shape
#np.save(model_path + "doc_id_fix_pad.npy", doc_emb)

# train query
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
#[qryTmpList, qryEmbList] = content2Emb(query, wordVec, 100)
#qry_emb = rePermute(qryTmpList, qryEmbList, qry_list)
qry_emb = content2List(query, qry_list)
qry_emb = np.asarray(qry_emb)
print qry_emb.shape
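A plausible next step, mirroring the commented-out np.save call used for the document ids above; the file name qry_id_fix_pad.npy is an assumption, not taken from the repository:

# Hypothetical: persist the padded query id array the same way the document
# branch would save doc_id_fix_pad.npy.
np.save(model_path + "qry_id_fix_pad.npy", qry_emb)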
Code Example #5
# -*- coding: utf-8 -*-
import ProcDoc
from gensim import corpora, models, matutils
from sklearn.cluster import KMeans
documents = ProcDoc.read_doc()
documents = ProcDoc.doc_preprocess(documents)

# tokenize each document (no stopword or frequency filtering is applied here)
texts = [document.lower().split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

print "TFIDF:"
corpus_tfidf = matutils.corpus2csc(corpus_tfidf).transpose()
print corpus_tfidf
print "__________________________________________"

num_of_clusters = 64
kmeans = KMeans(n_clusters=num_of_clusters)
doc_cluster = kmeans.fit_predict(corpus_tfidf)
clusters = [[] for i in range(num_of_clusters)]

for doc_index, cluster in enumerate(doc_cluster):
    clusters[cluster].append(doc_index)
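An optional, purely illustrative check on the clustering result: print how many documents fall into each of the 64 clusters so badly unbalanced clusterings are easy to spot (not part of the repository code):

for cluster_id, members in enumerate(clusters):
    print("cluster %d: %d documents" % (cluster_id, len(members)))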
Code Example #6
    # check whether the file exists before reading it
    if os.path.isfile(query_item_path):
        with open(query_item_path, 'r') as f:
            # read content of the query document (doc, content)
            query[query_item] = f.read()

# count background_word
for key, value in data.items():
    background_word = ProcDoc.word_count(value, dict(background_word))

for key, value in query.items():
    background_word = ProcDoc.word_count(value, dict(background_word))

background_word_sum = ProcDoc.word_sum(background_word)
# doc preprocess
data = ProcDoc.doc_preprocess(data)

# query preprocess
query = ProcDoc.query_preprocess(query)
query_word_count = {}
for q, q_content in query.items():
    query_word_count[q] = ProcDoc.word_count(q_content, {})

# query process
assessment = readAssessment.get_assessment()
lambda_test = {0: 0}
interval = 0.1
isBreak = "run"
while isBreak != "exit":
    for my_lambda in np.arange(0, 1, interval):
        if my_lambda in lambda_test: continue
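The my_lambda values swept here presumably interpolate each document's unigram model with the background model, i.e. Jelinek-Mercer smoothing, P(w|d) = (1 - lambda) * P_ml(w|d) + lambda * P(w|BG). The actual scoring code is not part of this excerpt; a hedged sketch with illustrative names:

import math

def query_likelihood(query_counts, doc_model, background_model, my_lambda):
    # Score log P(q|d) under linear interpolation of the document and background models.
    score = 0.0
    for word, count in query_counts.items():
        mixed = (1.0 - my_lambda) * doc_model.get(word, 0.0) \
                + my_lambda * background_model.get(word, 0.0)
        if mixed > 0.0:
            score += count * math.log(mixed)
    return score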
Code Example #7
            # read content of the query document (doc, content)
            query[query_item] = f.read()
			
# count background_word
for key, value in data.items():
	background_word = ProcDoc.word_count(value, dict(background_word))

for key, value in query.items():
	background_word = ProcDoc.word_count(value, dict(background_word))
	
background_word_sum = ProcDoc.word_sum(background_word)
#background_word = sorted(background_word.items(), key=lambda x: x[1], reverse=True)
#{k: v for k, v in sorted(background_word.items(), key=lambda item: item[1])}
#print(background_word)
# doc preprocess
[data_tf, data] = ProcDoc.doc_preprocess(data)

# query preprocess
query = ProcDoc.query_preprocess(query)
query_word_count = {}
for q, q_content in query.items():
	query_word_count[q] = ProcDoc.word_count(q_content, {})	

feedback_model = []
# query process
assessment = readAssessment.get_assessment()
lambda_test = {0: 0}
	interval = 0.1
isBreak = "run"
while isBreak != "exit":
	isWrite = True