def extract_gram_features(field, train_ids, test_ids, gram_type, unigram=False):
    """
    Build train and test feature matrices for the grams held in the given field.

    The full gram vocabulary is cached on disk at DATA_PATH + gram_type; it is
    fetched from ElasticSearch only when that cache file does not exist yet.

    :param field: name of the ElasticSearch field containing feature grams
    :param train_ids: ids of documents to train
    :param test_ids: ids of documents to test
    :param gram_type: type of gram (_unigram, _skipgram, etc), used as file suffix
    :param unigram: True if term to fetch has to be only unigram
    """
    grams_path = DATA_PATH + gram_type
    if not os.path.isfile(grams_path):
        # Cache miss: pull the complete gram list from ElasticSearch once.
        fetched_grams = ElasticSearchUtility().get_all_grams(
            TREC_SPAM_CORPUS_INDEX, TREC_SPAM_CORPUS_TYPE, field, unigram)
        list_to_file(fetched_grams, grams_path)
    gram_index = read_word_into_dict(grams_path)
    create_feature_matrix(gram_index, field, train_ids, FM_TRAIN_FILE + gram_type)
    create_feature_matrix(gram_index, field, test_ids, FM_TEST_FILE + gram_type)
def write_data_ids(self, train_data_id_file, test_data_id_file):
    """
    Split the qrel queries into a random test set and a training set and
    write "<query_id> <doc_id>" lines for each split to its own file.

    :param train_data_id_file: file to write training data ids in
    :param test_data_id_file: file to write testing data ids in
    """
    train_write_list = []
    test_write_list = []
    self.read_features()
    # Queries for which features exist; only these may enter either split.
    feature_queries = self.feature_dict_list[0]
    # random.sample requires a sequence (a dict view breaks on Python 3);
    # keep the sample in a set for O(1) membership tests in the loop below.
    test_queries = set(random.sample(list(feature_queries.keys()), TEST_SIZE))
    for query in self.qrel_data:
        if query in test_queries:
            query_string = query + ' '
            for document in self.qrel_data[query]:
                test_write_list.append(query_string + document)
        elif query in feature_queries:
            query_string = query + ' '
            for document in self.qrel_data[query]:
                train_write_list.append(query_string + document)
    list_to_file(train_write_list, train_data_id_file)
    list_to_file(test_write_list, test_data_id_file)
def create_features_matrix(self, train_data_ids, test_data_ids):
    """
    Build the train and test feature matrices (rows ordered to match the
    given id lists) and persist each matrix to its configured output file.

    :param train_data_ids: ordered ids of the training documents
    :param test_data_ids: ordered ids of the testing documents
    """
    train_matrix_list = self.get_feature_matrix_list(train_data_ids)
    test_matrix_list = self.get_feature_matrix_list(test_data_ids)
    list_to_file(train_matrix_list, TRAIN_DATA_FILE)
    list_to_file(test_matrix_list, TEST_DATA_FILE)
def store_top_n_results(self, n, out_file):
    """
    Store top n results for given n into given out_file from self.all_results
    where results are fetched file names from ElasticSearch using their doc ids

    :param n: number of top results to store
    :param out_file: output file for storing results
    """
    print 'Getting top ', n, ' results...'
    # NOTE(review): reverse=False sorts scores ASCENDING, so [:n] keeps the n
    # LOWEST-scored entries, while the docstring and the print say "top n".
    # Confirm whether a low score means a better result here; if not, this
    # should be reverse=True.
    top_n = sorted(self.all_results_dict.items(), key=lambda x: x[1], reverse=False)[:n]
    es_util = ElasticSearchUtility()
    # Keep only the doc ids of the selected entries.
    top_n_ids = [x[0] for x in top_n]
    # Resolve the selected doc ids to their stored file names via ES.
    top_n_files = es_util.get_field_values_for_docs(
        TREC_SPAM_CORPUS_INDEX, TREC_SPAM_CORPUS_TYPE, "file_name", top_n_ids)
    list_to_file(top_n_files, out_file)
def extract_features(feature_type, suffix): """ extract features based on feature type from ElasticSearch and store it in a feature matrix file :param feature_type: type of features (either "manual", "shingle" or "skipgrams" :param suffix: suffix for file names """ if feature_type == "manual": # get train and test ids train_ids, test_ids = get_ids() # store train and test ids list_to_file(train_ids, TRAIN_ID_FILE + suffix) list_to_file(test_ids, TEST_ID_FILE + suffix) # extract features for ids extract_manual_features("body_shingles", train_ids, test_ids, suffix) elif feature_type == "shingle": # get train and test ids train_ids, test_ids = get_ids() # store train and test ids list_to_file(train_ids, TRAIN_ID_FILE + suffix) list_to_file(test_ids, TEST_ID_FILE + suffix) # extract features for ids extract_gram_features("body_shingles", train_ids, test_ids, suffix) elif feature_type == "skipgram": # get train and test ids train_ids, test_ids = get_ids() # store train and test ids list_to_file(train_ids, TRAIN_ID_FILE + suffix) list_to_file(test_ids, TEST_ID_FILE + suffix) # extract features for ids extract_gram_features("body_skipgrams", train_ids, test_ids, suffix) else: print 'Invalid feature_type for function extract_features.' return None
if query_id in test_dict: test_dict[query_id].update({doc_no: score}) else: test_dict[query_id] = {doc_no: score} test_write_list = [] for query in test_dict: sorted_values = sorted(test_dict[query].iteritems(), key=lambda x: x[1], reverse=True)[:1000] i = 1 for element in sorted_values: doc_no = element[0] score = element[1] test_write_list.append(query + ' ' + 'Q ' + doc_no + ' ' + str(i) + ' ' + str(score) + ' Tirth') i += 1 list_to_file(test_write_list, TEST_RESULTS_FILE) # Training Performance: # run ML model on training matrix # rank and format results for IR(trec) evaluation train_data_ids = read_list_into_list(TRAIN_DATA_ID_FILE) # [[query_id, doc_no]...] train_output = read_word_into_list(TRAIN_OUTPUT_FILE) # [score, ...] train_length = len(train_data_ids) train_dict = dict() for i in range(0, train_length): query_id = train_data_ids[i][0] doc_no = train_data_ids[i][1] score = train_output[i] if query_id in train_dict: train_dict[query_id].update({doc_no: score})