Ejemplo n.º 1
0
    def evaluate(self, collection):
        """Run retrieval against a Solr index and check MAP with trec_eval.

        The retrieval command, the trec_eval command and the expected MAP are
        hard-coded per collection (core18, robust04, msmarco-passage,
        msmarco-doc).  The outcome is only logged: nothing is returned and a
        score mismatch does not raise.

        Args:
            collection: Name of the collection / Solr index to evaluate.

        Raises:
            Exception: If the index does not exist, or if ``collection`` is
                not one of the supported names.
        """
        if not self.does_index_exist(collection):
            raise Exception('The index {} does not exist!'.format(collection))

        # collection -> (retrieval command, trec_eval command, expected MAP)
        recipes = {
            'core18': (
                'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index core18 '
                '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.core18.txt '
                '-output runs/run.solr.core18.bm25.topics.core18.txt',
                'eval/trec_eval.9.0.4/trec_eval -m map -m P.30 '
                'src/main/resources/topics-and-qrels/qrels.core18.txt runs/run.solr.core18.bm25.topics.core18.txt',
                0.2495,
            ),
            'robust04': (
                'sh target/appassembler/bin/SearchSolr -topicreader Trec -solr.index robust04 '
                '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.robust04.txt '
                '-output runs/run.solr.robust04.bm25.topics.robust04.txt',
                'eval/trec_eval.9.0.4/trec_eval -m map -m P.30 '
                'src/main/resources/topics-and-qrels/qrels.robust04.txt runs/run.solr.robust04.bm25.topics.robust04.txt',
                0.2531,
            ),
            'msmarco-passage': (
                'sh target/appassembler/bin/SearchSolr -topicreader TsvString -solr.index msmarco-passage '
                '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt '
                '-output runs/run.solr.msmarco-passage.txt',
                'eval/trec_eval.9.0.4/trec_eval  -c -mrecall.1000 -mmap '
                'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.solr.msmarco-passage.txt',
                0.1926,
            ),
            'msmarco-doc': (
                # The trailing space after the output path is kept verbatim.
                'sh target/appassembler/bin/SearchSolr -topicreader TsvInt -solr.index msmarco-doc '
                '-solr.zkUrl localhost:9983 -topics src/main/resources/topics-and-qrels/topics.msmarco-doc.dev.txt '
                '-output runs/run.solr.msmarco-doc.txt ',
                'eval/trec_eval.9.0.4/trec_eval  -c -mrecall.1000 -mmap '
                'src/main/resources/topics-and-qrels/qrels.msmarco-doc.dev.txt runs/run.solr.msmarco-doc.txt',
                0.2310,
            ),
        }
        if collection not in recipes:
            raise Exception('Unknown collection: {}'.format(collection))
        search_cmd, eval_cmd, expected = recipes[collection]

        logger.info('Retrieval command: ' + search_cmd)
        regression_utils.run_shell_command(search_cmd, logger, echo=True)
        logger.info('Retrieval complete!')

        logger.info('Evaluation command: ' + eval_cmd)
        captured = regression_utils.run_shell_command(eval_cmd, logger, capture=True)
        # The score is the third tab-separated field of the first captured
        # entry (presumably the "map" line of trec_eval's output -- confirm
        # against run_shell_command's return value).
        first_line = captured[0]
        ap = float(first_line.split('\t')[2])

        if math.isclose(ap, expected):
            verdict = '[SUCESS] {} MAP verified as expected!'.format(ap)
        else:
            verdict = '[FAILED] {} MAP, expected {} MAP!'.format(ap, expected)
        logger.info(verdict)
Ejemplo n.º 2
0
 def delete_index(self, collection):
     """Delete the index named ``collection`` with the bundled ``solr`` CLI.

     Args:
         collection: Name of the index to delete.

     Returns:
         True when the index is no longer reported as existing after the
         delete command has run, False otherwise.

     Raises:
         Exception: If the index does not exist to begin with.
     """
     # Guard: refuse to delete something that is not there.
     if not self.does_index_exist(collection):
         raise Exception('The index {} does not exist!'.format(collection))

     delete_cmd = 'solrini/bin/solr delete -c {}'.format(collection)
     logger.info('Deleting index {} command: {}'.format(collection, delete_cmd))
     regression_utils.run_shell_command(delete_cmd, logger, echo=True)

     # Re-query so the caller learns whether the deletion took effect.
     still_present = self.does_index_exist(collection)
     return not still_present
Ejemplo n.º 3
0
 def upload_configs():
     """Remove stale language configs and upload the Solr configsets.

     Temporarily switches the working directory to
     ``src/main/resources/solr``, deletes the ``conf/lang`` directories of
     the ``anserini`` and ``anserini-twitter`` configsets, then runs
     ``./solr.sh`` against ``localhost:9983``.  The original working
     directory is always restored, even when one of the steps raises.
     """
     # NOTE(review): no ``self`` parameter is visible here; presumably this is
     # decorated as a static method outside this view -- confirm.
     original_dir = os.getcwd()
     os.chdir('src/main/resources/solr')
     try:
         command = 'rm -rf anserini/conf/lang anserini-twitter/conf/lang'
         logger.info('Deleting existed configs command: ' + command)
         regression_utils.run_shell_command(command, logger, echo=True)

         command = './solr.sh ../../../../solrini localhost:9983'
         logger.info('Uploading configs command: ' + command)
         regression_utils.run_shell_command(command, logger, echo=True)
     finally:
         # Go back to where we started rather than relying on a relative
         # '../../../..', so a failure above cannot strand the process in
         # the resources directory.
         os.chdir(original_dir)
     logger.info('Uploading complete!')
Ejemplo n.º 4
0
 def create_index(self, collection):
     """Create a new index named ``collection`` using the ``anserini`` config.

     The configsets are re-uploaded (via ``upload_configs``) before the
     ``solr create`` command is issued.

     Args:
         collection: Name of the index to create.

     Returns:
         Whatever ``does_index_exist`` reports for the collection after the
         create command has run.

     Raises:
         Exception: If an index with that name already exists.
     """
     # Guard: never create on top of an existing index.
     if self.does_index_exist(collection):
         raise Exception('The index {} already exists!'.format(collection))

     # Re-upload configsets to Solr's internal Zookeeper
     self.upload_configs()

     create_cmd = 'solrini/bin/solr create -n anserini -c {}'.format(collection)
     logger.info('Creating index {} command: {}'.format(collection, create_cmd))
     regression_utils.run_shell_command(create_cmd, logger, echo=True)

     created = self.does_index_exist(collection)
     return created
Ejemplo n.º 5
0
 def insert_docs(self, collection, path):
     """Index the documents under ``path`` into the Elasticsearch index.

     Builds the ``IndexCollection`` command line for the given collection
     from hard-coded per-collection settings and runs it.

     Args:
         collection: Target index name; one of robust04, msmarco-passage,
             core18 or msmarco-doc.
         path: Filesystem location of the input documents.

     Returns:
         Whatever ``regression_utils.run_shell_command`` returns for the
         indexing command.

     Raises:
         Exception: If ``path`` does not exist, the index does not exist,
             or the collection name is unknown.
     """
     logger.info('Inserting documents from {} into {}... '.format(
         path, collection))
     # Validate the path that will actually be indexed (the ``path``
     # argument), not a module-level command-line value.
     if not os.path.exists(path):
         raise Exception('{} does not exist!'.format(path))
     if not self.does_index_exist(collection):
         raise Exception('The index {} does not exist!'.format(collection))

     # TODO: abstract this into an external config instead of hard-coded.
     # collection -> (collection class, generator, thread count, store flag)
     settings = {
         'robust04': ('TrecCollection', 'DefaultLuceneDocumentGenerator', 16, '-storeRaw'),
         'msmarco-passage': ('JsonCollection', 'DefaultLuceneDocumentGenerator', 9, '-storeRaw'),
         'core18': ('WashingtonPostCollection', 'WashingtonPostGenerator', 8, '-storeContents'),
         'msmarco-doc': ('CleanTrecCollection', 'DefaultLuceneDocumentGenerator', 1, '-storeRaw'),
     }
     if collection not in settings:
         raise Exception('Unknown collection: {}'.format(collection))
     collection_class, generator, threads, store_flag = settings[collection]

     command = ('sh target/appassembler/bin/IndexCollection -collection {} '
                '-generator {} -es -es.index {} -threads {} -input {} '
                '-storePositions -storeDocvectors {}').format(
                    collection_class, generator, collection, threads, path,
                    store_flag)
     logger.info('Running indexing command: ' + command)
     return regression_utils.run_shell_command(command, logger, echo=True)
Ejemplo n.º 6
0
 def insert_docs(self, collection, path):
     """Index the documents under ``path`` into the Solr index.

     Builds the ``IndexCollection`` command line (Solr mode, ZooKeeper at
     ``localhost:9983``, 8 threads) for the given collection from
     hard-coded per-collection settings and runs it.

     Args:
         collection: Target index name; one of core18, robust04,
             msmarco-passage or msmarco-doc.
         path: Filesystem location of the input documents.

     Returns:
         Whatever ``regression_utils.run_shell_command`` returns for the
         indexing command.

     Raises:
         Exception: If ``path`` does not exist, the index does not exist,
             or the collection name is unknown.
     """
     logger.info('Inserting documents from {} into {}... '.format(path, collection))
     # Validate the path that will actually be indexed (the ``path``
     # argument), not a module-level command-line value.
     if not os.path.exists(path):
         raise Exception('{} does not exist!'.format(path))
     if not self.does_index_exist(collection):
         raise Exception('The index {} does not exist!'.format(collection))

     # collection -> (collection class, generator, store flag)
     settings = {
         'core18': ('WashingtonPostCollection', 'WashingtonPostGenerator', '-storeTransformedDocs'),
         'robust04': ('TrecCollection', 'JsoupGenerator', '-storeRawDocs'),
         'msmarco-passage': ('JsonCollection', 'JsoupGenerator', '-storeRawDocs'),
         'msmarco-doc': ('TrecCollection', 'LuceneDocumentGenerator', '-storeRawDocs'),
     }
     if collection not in settings:
         raise Exception('Unknown collection: {}'.format(collection))
     collection_class, generator, store_flag = settings[collection]

     command = ('sh target/appassembler/bin/IndexCollection -collection {} '
                '-generator {} -solr -solr.index {} -solr.zkUrl localhost:9983 '
                '-threads 8 -input {} -storePositions -storeDocvectors {}').format(
                    collection_class, generator, collection, path, store_flag)
     logger.info('Running indexing command: ' + command)
     return regression_utils.run_shell_command(command, logger, echo=True)
Ejemplo n.º 7
0
    def evaluate(self, collection):
        """Run retrieval against an Elasticsearch index and check MAP.

        The retrieval command, the trec_eval command and the expected MAP are
        hard-coded per collection (robust04, msmarco-passage).  The outcome
        is only logged: nothing is returned and a score mismatch does not
        raise.

        Args:
            collection: Name of the collection / index to evaluate.

        Raises:
            Exception: If the index does not exist, or if ``collection`` is
                not one of the supported names.
        """
        if not self.does_index_exist(collection):
            raise Exception('The index {} does not exist!'.format(collection))

        # TODO: abstract this into an external config instead of hard-coded.
        # collection -> (retrieval command, trec_eval command, expected MAP)
        recipes = {
            'robust04': (
                'sh target/appassembler/bin/SearchElastic -topicreader Trec -es.index robust04 '
                '-topics src/main/resources/topics-and-qrels/topics.robust04.txt '
                '-output run.es.robust04.bm25.topics.robust04.txt',
                'eval/trec_eval.9.0.4/trec_eval -m map -m P.30 '
                'src/main/resources/topics-and-qrels/qrels.robust04.txt run.es.robust04.bm25.topics.robust04.txt',
                0.2531,
            ),
            'msmarco-passage': (
                # The run file is spelled "msmacro" in both commands, so the
                # evaluation reads the same file the retrieval step writes.
                'sh target/appassembler/bin/SearchElastic -topicreader TsvString -es.index msmarco-passage '
                '-topics src/main/resources/topics-and-qrels/topics.msmarco-passage.dev-subset.txt '
                '-output run.es.msmacro-passage.txt',
                'eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap '
                'src/main/resources/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt run.es.msmacro-passage.txt',
                0.1956,
            ),
        }
        if collection not in recipes:
            raise Exception('Unknown collection: {}'.format(collection))
        search_cmd, eval_cmd, expected = recipes[collection]

        logger.info('Retrieval command: ' + search_cmd)
        regression_utils.run_shell_command(search_cmd, logger, echo=True)
        logger.info('Retrieval complete!')

        logger.info('Evaluation command: ' + eval_cmd)
        captured = regression_utils.run_shell_command(eval_cmd, logger, capture=True)
        # The score is the third tab-separated field of the first captured
        # entry (presumably the "map" line of trec_eval's output -- confirm
        # against run_shell_command's return value).
        first_line = captured[0]
        ap = float(first_line.split('\t')[2])

        if math.isclose(ap, expected):
            verdict = '[SUCESS] {} MAP verified as expected!'.format(ap)
        else:
            verdict = '[FAILED] {} MAP, expected {} MAP!'.format(ap, expected)
        logger.info(verdict)
Ejemplo n.º 8
0
             f'{st_command} -index {mb11_index}                        -topics {mb11_topics}                     -output'], \
           [ f'{sc_command} -index {mb11_index} -topicreader Microblog -topics {mb11_topics} -bm25 -rm3 -searchtweets -output', \
             f'{st_command} -index {mb11_index}                        -topics {mb11_topics}       -rm3               -output'], \
         ]

if __name__ == '__main__':
    # For every group of supposedly equivalent commands: run each command,
    # writing its results to a dedicated run file, then require that all run
    # files of the group have identical content.
    for group_cnt, group in enumerate(groups):
        print(f'# Verifying Group {group_cnt}')
        group_runs = []
        for entry_cnt, entry in enumerate(group):
            # Each entry ends in "-output", so appending the path completes
            # the command line.
            run_file = f'runs/run.ss_verify.g{group_cnt}.e{entry_cnt}.txt'
            cmd = f'{entry} {run_file}'
            print(f'Running: {cmd}')
            regression_utils.run_shell_command(cmd, logger, echo=False)

            # Load in the run file. Newlines are stripped before comparing,
            # so runs that differ only in line breaks count as identical.
            with open(run_file, 'r') as file:
                group_runs.append(file.read().replace('\n', ''))

        # Check that all run files are identical (compare every later run
        # against the first; an empty or single-entry group passes trivially).
        if any(run != group_runs[0] for run in group_runs[1:]):
            raise ValueError(
                f'Group {group_cnt}: Results are not identical!')

        print(f'# Group {group_cnt}: Results identical')