Example 1
def extract_gram_features(field,
                          train_ids,
                          test_ids,
                          gram_type,
                          unigram=False):
    """
    Fetch the gram terms from ElasticSearch (caching them in a local grams file),
    extract the corresponding values for the given documents and store them in feature matrix files

    :param field: name of the field containing feature grams
    :param train_ids: ids of documents to train
    :param test_ids: ids of documents to test
    :param gram_type: type of gram (_unigram, _skipgram, etc.)
    :param unigram: True if term to fetch has to be only unigram
    """

    if not os.path.isfile(DATA_PATH + gram_type):
        es_util = ElasticSearchUtility()
        grams = es_util.get_all_grams(TREC_SPAM_CORPUS_INDEX,
                                      TREC_SPAM_CORPUS_TYPE, field, unigram)
        list_to_file(grams, DATA_PATH + gram_type)

    all_grams_dict = read_word_into_dict(DATA_PATH + gram_type)

    create_feature_matrix(all_grams_dict, field, train_ids,
                          FM_TRAIN_FILE + gram_type)
    create_feature_matrix(all_grams_dict, field, test_ids,
                          FM_TEST_FILE + gram_type)
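
The snippet leans on two small file helpers that are not shown here. Below is a minimal sketch of what they presumably do, assuming list_to_file writes one item per line and read_word_into_dict maps each term to its line index; both are assumptions, not the original implementations.

def list_to_file(items, file_path):
    # Assumed behaviour: write every item on its own line.
    with open(file_path, 'w') as out_file:
        for item in items:
            out_file.write(str(item) + '\n')


def read_word_into_dict(file_path):
    # Assumed behaviour: map each gram to its line number, giving every term
    # a stable column index for the feature matrix.
    gram_dict = {}
    with open(file_path) as in_file:
        for index, line in enumerate(in_file):
            gram_dict[line.strip()] = index
    return gram_dict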
Example 2
    def write_data_ids(self, train_data_id_file, test_data_id_file):
        """
        Write the training data ids and the testing data ids
        to two separate files

        :param train_data_id_file: file to write training data ids in
        :param test_data_id_file: file to write testing data ids in
        """
        train_write_list = []
        test_write_list = []

        self.read_features()

        query_id_list = self.feature_dict_list[0].keys()
        test_query_list = random.sample(query_id_list, TEST_SIZE)

        for query in self.qrel_data:
            if query in test_query_list:
                query_string = query + ' '
                for document in self.qrel_data[query]:
                    test_write_list.append(query_string + document)
            elif query in query_id_list:
                query_string = query + ' '
                for document in self.qrel_data[query]:
                    train_write_list.append(query_string + document)

        list_to_file(train_write_list, train_data_id_file)
        list_to_file(test_write_list, test_data_id_file)
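
The split itself is just random.sample drawing TEST_SIZE query ids without replacement. Here is a standalone sketch of the same idea; the function and parameter names are illustrative and not part of the original module.

import random

def split_query_ids(query_ids, test_size, seed=None):
    # Draw test_size query ids for testing; every remaining query id goes to training.
    rng = random.Random(seed)
    test_ids = set(rng.sample(list(query_ids), test_size))
    train_ids = [q for q in query_ids if q not in test_ids]
    return train_ids, sorted(test_ids)

# e.g. split_query_ids(['51', '52', '53', '54'], 2, seed=7)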
Example 3
    def create_features_matrix(self, train_data_ids, test_data_ids):
        """
        create features matrix from feature files in order of data_id_list
        and store the matrix in a file

        :param data_id_list:
        :return:
        """
        train_matrix_list = self.get_feature_matrix_list(train_data_ids)
        test_matrix_list = self.get_feature_matrix_list(test_data_ids)

        list_to_file(train_matrix_list, TRAIN_DATA_FILE)
        list_to_file(test_matrix_list, TEST_DATA_FILE)

    def store_top_n_results(self, n, out_file):
        """
        Store the top n results from self.all_results_dict into the given out_file,
        where each result is the file name fetched from ElasticSearch for the corresponding doc id

        :param n: number of top results to store
        :param out_file: output file for storing results
        """

        print 'Getting top', n, 'results...'

        top_n = sorted(self.all_results_dict.items(),
                       key=lambda x: x[1],
                       reverse=False)[:n]

        es_util = ElasticSearchUtility()

        top_n_ids = [x[0] for x in top_n]

        top_n_files = es_util.get_field_values_for_docs(
            TREC_SPAM_CORPUS_INDEX, TREC_SPAM_CORPUS_TYPE, "file_name",
            top_n_ids)

        list_to_file(top_n_files, out_file)
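
The ranking step only needs the (doc id, score) pairs from self.all_results_dict. A small standalone sketch of the same top-n selection, keeping the ascending order used above (reverse=False, so lower scores rank first):

def top_n_doc_ids(results_dict, n):
    # Sort (doc_id, score) pairs by score, ascending, and keep the first n ids.
    ranked = sorted(results_dict.items(), key=lambda pair: pair[1])
    return [doc_id for doc_id, _ in ranked[:n]]

# e.g. top_n_doc_ids({'d1': 0.9, 'd2': 0.1, 'd3': 0.4}, 2) -> ['d2', 'd3']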
Example 5
def extract_features(feature_type, suffix):
    """
    extract features based on feature type from ElasticSearch and store it in a feature matrix file

    :param feature_type: type of features (either "manual", "shingle" or "skipgram")
    :param suffix: suffix for file names
    """

    if feature_type == "manual":
        # get train and test ids
        train_ids, test_ids = get_ids()
        # store train and test ids
        list_to_file(train_ids, TRAIN_ID_FILE + suffix)
        list_to_file(test_ids, TEST_ID_FILE + suffix)
        # extract features for ids
        extract_manual_features("body_shingles", train_ids, test_ids, suffix)
    elif feature_type == "shingle":
        # get train and test ids
        train_ids, test_ids = get_ids()
        # store train and test ids
        list_to_file(train_ids, TRAIN_ID_FILE + suffix)
        list_to_file(test_ids, TEST_ID_FILE + suffix)
        # extract features for ids
        extract_gram_features("body_shingles", train_ids, test_ids, suffix)
    elif feature_type == "skipgram":
        # get train and test ids
        train_ids, test_ids = get_ids()
        # store train and test ids
        list_to_file(train_ids, TRAIN_ID_FILE + suffix)
        list_to_file(test_ids, TEST_ID_FILE + suffix)
        # extract features for ids
        extract_gram_features("body_skipgrams", train_ids, test_ids, suffix)
    else:
        print 'Invalid feature_type for function extract_features.'
        return None
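
The three branches above only differ in the field name that gets passed on, so the dispatch can be table-driven. The following refactoring sketch reuses the helpers and constants from the snippet; the GRAM_FIELDS mapping and the function name are assumptions, not part of the original code.

GRAM_FIELDS = {
    "shingle": "body_shingles",
    "skipgram": "body_skipgrams",
}


def extract_features_compact(feature_type, suffix):
    # Same flow as extract_features above: persist the id split, then extract
    # features for the field associated with the requested feature type.
    train_ids, test_ids = get_ids()
    list_to_file(train_ids, TRAIN_ID_FILE + suffix)
    list_to_file(test_ids, TEST_ID_FILE + suffix)

    if feature_type == "manual":
        extract_manual_features("body_shingles", train_ids, test_ids, suffix)
    elif feature_type in GRAM_FIELDS:
        extract_gram_features(GRAM_FIELDS[feature_type], train_ids, test_ids, suffix)
    else:
        print 'Invalid feature_type for function extract_features.'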
Example 6
        if query_id in test_dict:
            test_dict[query_id].update({doc_no: score})
        else:
            test_dict[query_id] = {doc_no: score}

    test_write_list = []
    for query in test_dict:
        sorted_values = sorted(test_dict[query].iteritems(), key=lambda x: x[1], reverse=True)[:1000]
        i = 1
        for element in sorted_values:
            doc_no = element[0]
            score = element[1]
            test_write_list.append(query + ' ' + 'Q ' + doc_no + ' ' + str(i) + ' ' + str(score) + ' Tirth')
            i += 1

    list_to_file(test_write_list, TEST_RESULTS_FILE)

    # Training performance:
    #   - run ML model on the training matrix
    #   - rank and format results for IR (trec) evaluation
    train_data_ids = read_list_into_list(TRAIN_DATA_ID_FILE) # [[query_id, doc_no]...]
    train_output = read_word_into_list(TRAIN_OUTPUT_FILE) # [score, ...]
    train_length = len(train_data_ids)

    train_dict = dict()
    for i in range(0, train_length):
        query_id = train_data_ids[i][0]
        doc_no = train_data_ids[i][1]
        score = train_output[i]
        if query_id in train_dict:
            train_dict[query_id].update({doc_no: score})
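
The snippet is cut off before the training results are ranked and written out, but the comment block indicates it repeats the same pass as the test branch. A compact sketch of that step, assuming the run-file line layout matches the test branch above (query id, literal 'Q' column, doc id, rank, score, run tag) and the same cut-off of 1000 documents per query:

def format_run_lines(score_dict, run_tag='Tirth', depth=1000):
    # score_dict: {query_id: {doc_no: score}}; emit the top `depth` docs per query,
    # highest score first, in the same space-separated layout as test_write_list.
    lines = []
    for query_id in score_dict:
        ranked = sorted(score_dict[query_id].iteritems(),
                        key=lambda x: x[1], reverse=True)[:depth]
        for rank, (doc_no, score) in enumerate(ranked, start=1):
            lines.append(query_id + ' Q ' + doc_no + ' ' +
                         str(rank) + ' ' + str(score) + ' ' + run_tag)
    return lines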