Example #1
    def __init__(self,
                 num_of_homo_feats=10,
                 max_qry_length=1794,
                 max_doc_length=2907,
                 query_path=None,
                 document_path=None,
                 corpus="TDT2"):
        res_pos = True
        self.num_vocab = 51253
        self.max_qry_length = max_qry_length
        self.max_doc_length = max_doc_length
        self.num_of_homo_feats = num_of_homo_feats
        if query_path is None:
            query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if document_path is None:
            document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"

        # relevance set
        self.hmm_training_set = ProcDoc.readRELdict()

        # read documents, preserving position information
        doc = ProcDoc.readFile(document_path)
        self.doc = ProcDoc.docPreproc(doc, res_pos, 200)

        # read queries, preserving position information
        qry = ProcDoc.readFile(query_path)
        self.qry = ProcDoc.qryPreproc(qry, self.hmm_training_set, res_pos, 200)

        # generate homogeneous features
        self.homo_feats = self.__genFeature(num_of_homo_feats)
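
The constructor above fixes max_qry_length and max_doc_length, which suggests the position-preserving word-ID sequences are later padded or truncated to fixed-size arrays before being fed to a model. Below is a minimal sketch of such a padding step; the helper name pad_to_length, the pad ID 0, and the input format (a plain list of word IDs) are assumptions for illustration, not part of the original ProcDoc pipeline.

import numpy as np

def pad_to_length(id_seq, max_length, pad_id=0):
    # Pad (or truncate) a list of word IDs to a fixed length.
    # pad_id and the truncation policy are assumptions.
    arr = np.full(max_length, pad_id, dtype=np.int64)
    ids = id_seq[:max_length]   # truncate overly long sequences
    arr[:len(ids)] = ids        # copy the real IDs to the front
    return arr

# hypothetical usage with the lengths from the constructor above
padded_qry = pad_to_length([17, 204, 9981, 3], 1794)
padded_doc = pad_to_length(list(range(5000)), 2907)
print(padded_qry.shape, padded_doc.shape)  # (1794,) (2907,)
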
Example #2
    def __init__(self,
                 qry_path=None,
                 rel_path=None,
                 isTraining=True,
                 doc_path=None):
        # default paths (training setup)
        if qry_path is None:
            qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
        if doc_path is None:
            doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
        if rel_path is None:
            rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
        self.vocab_size = 51253
        # relevance set
        self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
        self.evaluate_model = EvaluateModel(rel_path, isTraining)

        # read documents
        doc = ProcDoc.readFile(doc_path)
        self.doc = ProcDoc.docPreproc(doc)
        self.doc_len = Statistical.compLenAcc(self.doc)

        # read queries
        qry = ProcDoc.readFile(qry_path)
        self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
        self.qry_len = Statistical.compLenAcc(self.qry_tf)
        self.qry, self.doc = Statistical.TFIDF(self.qry_tf, self.doc,
                                               self.qry_len, self.doc_len)

        # dict to numpy
        self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
        self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
        self.doc, self.doc_IDs = self.__dict2np(self.doc)

        # L2-normalize document vectors
        self.doc = Statistical.l2Normalize(self.doc)
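
__dict2np and Statistical.l2Normalize are not shown in this example. Assuming the TF-IDF results are nested dicts of the form {doc_id: {term_id: weight}}, the sketch below shows one way to turn such a dict into a dense matrix over the 51253-term vocabulary and L2-normalize its rows with numpy; the function names and the dict layout are assumptions made for illustration.

import numpy as np

def dict_to_matrix(weight_dict, vocab_size=51253):
    # Convert {item_id: {term_id: weight}} into (matrix, ordered item IDs).
    # The nested-dict layout is an assumption about what __dict2np receives.
    item_ids = list(weight_dict.keys())
    mat = np.zeros((len(item_ids), vocab_size), dtype=np.float32)
    for row, item_id in enumerate(item_ids):
        for term_id, weight in weight_dict[item_id].items():
            mat[row, int(term_id)] = weight
    return mat, item_ids

def l2_normalize_rows(mat):
    # Divide each row by its Euclidean norm; all-zero rows are left unchanged.
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return mat / norms

# hypothetical usage
doc_mat, doc_ids = dict_to_matrix({"d1": {3: 0.5, 7: 1.2}, "d2": {1: 0.9}}, vocab_size=10)
doc_mat = l2_normalize_rows(doc_mat)
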
Example #3
# imports assumed from the surrounding project (ProcDoc, RelPrep, NRMprep modules)
import ProcDoc
import RelPrep
import NRMprep

type_feat = "sparse"  # or "embeddings"
query_path = None
document_path = None
QDrel_file_path = None

corpus = "TDT2"

# qry and doc
if query_path is None:
    query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
if document_path is None:
    document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
if QDrel_file_path is None:
    QDrel_file_path = "../Significant-Words-Language-Models/train-qry-results-0.675969697596.txt"
# relevance set
hmm_training_set = ProcDoc.readRELdict()

# read documents, preserving position information
doc = ProcDoc.readFile(document_path)
doc = ProcDoc.docPreproc(doc, RES_POS)

# read queries, preserving position information
qry = ProcDoc.readFile(query_path)
qry = ProcDoc.qryPreproc(qry, hmm_training_set, RES_POS)
QDrel = RelPrep.readQDRel(QDrel_file_path)

print(len(qry), len(doc))
print(len(QDrel))
NRMprep.getTrainAndValidation(qry, doc, QDrel, NUM_VOCAB, type_rank, type_feat)
# (pointwise or pairwise) and (sparse or embeddings)
# prepare data and label
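
NRMprep.getTrainAndValidation is called with a ranking type ("pointwise or pairwise", per the comment) and a feature type ("sparse or embeddings"). As a rough illustration of the pointwise case only, the sketch below builds (query, document, label) triples from a relevance dict; the {qry_id: [relevant doc_ids]} layout, the helper name, and the random negative sampling are assumptions, not the project's actual preparation logic.

import random

def pointwise_triples(qdrel, doc_ids, neg_per_pos=1, seed=0):
    # Build (qry_id, doc_id, label) triples for pointwise training.
    # qdrel is assumed to map each query ID to its relevant doc IDs, and
    # doc_ids is assumed to contain at least one non-relevant document.
    rng = random.Random(seed)
    triples = []
    for qry_id, rel_docs in qdrel.items():
        rel_set = set(rel_docs)
        for doc_id in rel_docs:
            triples.append((qry_id, doc_id, 1))      # relevant pair
            for _ in range(neg_per_pos):
                neg = rng.choice(doc_ids)
                while neg in rel_set:                # resample until non-relevant
                    neg = rng.choice(doc_ids)
                triples.append((qry_id, neg, 0))     # sampled negative
    return triples

# hypothetical usage
print(pointwise_triples({"q1": ["d2"]}, ["d1", "d2", "d3"]))
# e.g. [('q1', 'd2', 1), ('q1', 'd1', 0)]
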