Code example #1
    def __init__(self,
                 num_of_homo_feats=10,
                 max_qry_length=1794,
                 max_doc_length=2907,
                 query_path=None,
                 document_path=None,
                 corpus="TDT2"):
        res_pos = True    # preserve word positions during preprocessing
        str2int = True    # convert word-ID tokens from strings to ints
        self.num_vocab = 51253
        self.max_qry_length = max_qry_length
        self.max_doc_length = max_doc_length
        self.num_of_homo_feats = num_of_homo_feats
        if query_path is None:
            query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if document_path is None:
            document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
        # read document, reserve position
        doc = ProcDoc.read_file(document_path)
        self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)

        # read query, reserve position
        qry = ProcDoc.read_file(query_path)
        self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)

        # HMMTrainingSet
        self.hmm_training_set = ProcDoc.read_relevance_dict()
        self.homo_feats = self.__genFeature(num_of_homo_feats)
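
The snippet above shows only the constructor; the enclosing class name and the __genFeature helper are not included. A minimal usage sketch under that assumption (the module and class names below are hypothetical placeholders, not part of the original project):

# Hypothetical usage sketch -- 'data_reader' and 'DataReader' are placeholder
# names; the real class name is not shown in the snippet above.
from data_reader import DataReader

reader = DataReader(num_of_homo_feats=10,
                    max_qry_length=1794,
                    max_doc_length=2907,
                    corpus="TDT2")          # paths default to the TDT2 layout shown above
print(len(reader.qry), len(reader.doc))     # preprocessed queries and documents
print(reader.homo_feats)                    # homogeneous features built in __init__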
Code example #2
import numpy as np
import ProcDoc

#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"


# read document
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)

# HMM training set (query-document relevance judgments)
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}

query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}

for q, q_content in query.items():
	query_wordcount[q] = ProcDoc.word_count(q_content, {})

query_unigram = ProcDoc.unigram(query_wordcount)


# create outside query model
query_model = []
q_list = query_unigram.keys()
for q, w_uni in query_unigram.items():
	if q in HMMTraingSetDict:
		vocabulary = np.zeros(51253)
		for w, uni in w_uni.items():
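
The example is cut off inside the inner loop. A plausible completion is sketched below as a separate block; the assumption (not confirmed by the source) is that each training query's unigram probabilities are scattered into a dense vector of vocabulary size 51253, indexed by word ID:

# Sketch of an assumed continuation: build one dense vocabulary-sized vector
# per training query from its unigram distribution.
import numpy as np

query_model = []
for q, w_uni in query_unigram.items():
    if q in HMMTraingSetDict:
        vocabulary = np.zeros(51253)
        for w, uni in w_uni.items():
            vocabulary[int(w)] = uni      # word ID -> unigram probability
        query_model.append(vocabulary)
query_model = np.array(query_model)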
Code example #3
import os
import pickle as Pickle   # assumption: the original script imports its pickle module under this alias
import ProcDoc

# general model
collection = {}
collection_total_similarity = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        if word in collection:
            collection[word] += count
        else:
            collection[word] = count

collection_word_sum = 1.0 * ProcDoc.word_sum(collection)
general_model = {k: v / collection_word_sum for k, v in collection.items()}

# query model
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}

for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})

query_unigram = ProcDoc.unigram(dict(query_wordcount))
query_model = query_unigram
with open("model/query_model.pkl", "wb") as f:
    Pickle.dump(query_model, f, True)

# remove leftover model files listed in remove_list (defined elsewhere in the script)
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)

# Embedded Query Expansion
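
For reference, the "general model" built above is a maximum-likelihood unigram model over the whole collection, P(w|C) = count(w, C) / total count over all words. The counting loop can be written more compactly with collections.Counter; this sketch assumes doc_wordcount maps each document ID to a {word: count} dict, as in the code above:

# Equivalent construction of the collection ("general") model using Counter.
from collections import Counter

collection = Counter()
for counts in doc_wordcount.values():
    collection.update(counts)            # accumulate word counts over all documents

total = float(sum(collection.values()))
general_model = {w: c / total for w, c in collection.items()}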
Code example #4
import ProcDoc

LAYERS = 1
TRAINING_SIZE = 800
EPOCHS = 200
BATCH_SIZE = 50
# Vocabulary size: number of distinct word IDs in the corpus.
VOCAB_SIZE = 51253
corpus = "TDT2"
ENCODE_LENGTH = len('{0:016b}'.format(VOCAB_SIZE))
qry_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
res_pos = True   # preserve word positions during preprocessing
str2int = True   # convert word-ID tokens from strings to ints

chars = list(range(VOCAB_SIZE + 1))
ctable = CharacterTable(chars, ENCODE_LENGTH)
qry = ProcDoc.read_file(qry_path)
qry = ProcDoc.query_preprocess(qry, res_pos, str2int)
TRAINING_SIZE = len(qry.keys())
questions = []
expected = []
count = 0
print('Generating data...')
for q_name, q_cont in qry.items():
    #a = ' '.join(str(np.random.choice(chars)) for i in range(np.random.randint(1, MAXLEN)))
    # Left-pad each query with '0' so that it is always MAXLEN tokens long.
    q = [str(e + 1) for e in q_cont]
    for x in range(MAXLEN - len(q)):
        q.insert(0, '0')
    #print(q)
    questions.append(q)
    count += 1
    print(str(count) + "/" + str(TRAINING_SIZE), end='\r')
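
The loop above left-pads each query's word-ID sequence with the string '0' (IDs are shifted by one so that 0 is free to act as padding). MAXLEN is not defined in the snippet; it is assumed to be the maximum query length set elsewhere in the script. A compact equivalent of the padding step:

# Equivalent left-padding helper; MAXLEN is assumed to be defined elsewhere.
def pad_query(q_cont, maxlen):
    q = [str(e + 1) for e in q_cont]        # shift word IDs by one so '0' can mean padding
    return ['0'] * (maxlen - len(q)) + q    # left-pad up to maxlen

# Example: pad_query([4, 7, 2], 6) -> ['0', '0', '0', '5', '8', '3']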