Code Example #1
    def start_ranker(self, isAsync, isRepeat, text_boost=1, title_boost=1000,
                     bm25_b=0.75, bm25_k1=2):
        settings = {
            "text_boost": text_boost,
            "title_boost": title_boost,
            "bm25_b": bm25_b,
            "bm25_k1": bm25_k1
        }

        def _check_crawler(event):
            # Signal the ranker to start, then block until it clears the event.
            event.set()
            while event.is_set():
                time.sleep(0.2)
            print("Ranker finished")

        def _check_repeat_crawler(_event_ranker, _event_stop_repeat):
            _event_stop_repeat.clear()
            _event_ranker.clear()
            proc = None
            while not _event_stop_repeat.is_set():
                if _event_ranker.is_set():
                    # A run is still in flight; poll until the ranker clears it.
                    time.sleep(0.5)
                else:
                    # Previous run finished: drop any stale process and restart.
                    _event_ranker.set()
                    if proc is not None and proc.is_alive():
                        proc.terminate()
                    proc = Ranker.Ranker(settings, _event_ranker)
                    proc.start()
            print("Ranker finished")

        if not isAsync and isRepeat:
            raise Exception("You would deadlock with this configuration (sync + repeat)")

        event_finish_ranker = Event()
        if isAsync:
            if isRepeat:
                repeat_crawler_process = Process(target=_check_repeat_crawler,
                                                 args=(event_finish_ranker, self.event_repeat))
                repeat_crawler_process.start()
            else:
                process = Ranker.Ranker(settings, event_finish_ranker)
                process.start()

                check_process = Process(target=_check_crawler, args=(event_finish_ranker,))
                check_process.start()
        else:
            process = Ranker.Ranker(settings, event_finish_ranker)
            process.start()

            _check_crawler(event_finish_ranker)

        print "You may continue work"
Code Example #2
    def match_fuzzy(self, uni_query='', idx={}):
        if not idx: return []
        segmenter = Segmenter.get_seg(id=idx['seg'])
        prefix = idx['prefix']
        list_s = segmenter(uni_query)
        list_bitset, list_docid, len_list_docid = [], [], 0
        for s in list_s:
            str_s = redis_zero.hget(prefix, s)
            if str_s:
                yhBitset = YhBitset.YhBitset()
                yhBitset.frombytes(str_s)
                list_bitset.append(yhBitset)
                logger.error('%s matched len %s' % (s, yhBitset.length()))
            else:
                logger.error('%s filtered' % s)

        bitset = YhBitset.YhBitset()
        if list_bitset:
            bitset = list_bitset[0]
            for bs in list_bitset[1:]:
                test = bitset.anditem(bs)
                if test.length() <= 10:
                    break
                bitset = test
                logger.error('test_length %s' % test.length())
        #logger.error('match_title seg %s  len %s ids %s' % ('|'.join(list_s), len(list_docid), list_docid[:3]))
        list_docid = bitset.search(200, 1)
        list_docid = ['%s' % id for id in list_docid]
        list_docid = Ranker.Ranker().getRank(name='unigram_rank',
                                             list_id=list_docid)
        logger.error('match_fuzzy %s' % list_docid)
        return list_docid[:200]
Code Example #3
File: Xywy_Indexer.py  Project: yanghaocsg/CloudSE_V1
 def full_match(self, uni_query="", idx={}):
     if not idx: return []
     prefix = idx['prefix']
     list_s = [uni_query]
     logger.error("================lists:%s" % ("|".join(list_s)))
     list_bitset, list_docid, len_list_docid = [], [], 0
     for s in list_s:
         str_s = redis_zero.hget(prefix, s)
         if str_s:
             yhBitset = YhBitset.YhBitset()
             yhBitset.frombytes(str_s)
             list_bitset.append(yhBitset)
             logger.error('%s matched len %s' % (s, yhBitset.length()))
         else:
             logger.error('%s filtered' % s)
     bitset = YhBitset.YhBitset()
     if list_bitset:
         bitset = list_bitset[0]
         for bs in list_bitset[1:]:
             test = bitset.anditem(bs)
             if test.length() <= 0:
                 break
             bitset = test
     list_docid = bitset.search(200, 1)
     list_docid = ['%s' % id for id in list_docid]
     list_docid = Ranker.Ranker().getRank(name='unigram_rank',
                                          list_id=list_docid)
     logger.error('match [%s] [%s] [%s]' % (uni_query, list_s, list_docid))
     return list_docid[:200]
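
Code Examples #2 and #3 share one retrieval pattern: fetch a posting bitset per query segment from a Redis hash, intersect them with anditem, and keep the last intersection before the result becomes empty (or, in the fuzzy variant, too small). A toy rendering of that logic, with plain Python sets standing in for YhBitset and a dict standing in for redis_zero (all terms and doc ids invented):

postings = {
    'headache': {1, 2, 3, 5},
    'fever':    {2, 3, 5, 8},
    'cough':    {3, 5, 9},
}

def match(segments):
    hits = [postings[s] for s in segments if s in postings]  # missing terms are "filtered"
    if not hits:
        return []
    result = hits[0]
    for h in hits[1:]:
        test = result & h  # the anditem() step
        if not test:       # stop before the intersection goes empty
            break
        result = test
    return sorted(result)[:200]  # rough analogue of bitset.search(200, 1)

print(match(['headache', 'fever', 'cough']))  # -> [3, 5]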
Code Example #4
 def _check_repeat_crawler(_event_ranker, _event_stop_repeat):
     # Note: settings comes from the enclosing scope (see Code Example #1).
     _event_stop_repeat.clear()
     _event_ranker.clear()
     proc = None
     while not _event_stop_repeat.is_set():
         if _event_ranker.is_set():
             time.sleep(0.5)
         else:
             _event_ranker.set()
             if proc is not None and proc.is_alive():
                 proc.terminate()
             proc = Ranker.Ranker(settings, _event_ranker)
             proc.start()
     print("Ranker finished")
Code Example #5
def search(posting_path, query, stemmer, query_source_path, list_of_language,
           list_of_city, semantic):
    Parse.set_stop_words_file(posting_path + "/stop_words.txt")
    list_save_queries = Parse.parse_queries(query_source_path, posting_path,
                                            query, stemmer, semantic)
    res = {}
    for query_post in list_save_queries:
        fileName = posting_path + "/" + query_post + ".pkl"
        with open(fileName, "rb") as file:
            querie_term_dictionary = pickle.load(file)
        os.remove(fileName)
        query_name = query_post.replace('post', "")
        res[query_name] = Ranker.rank(posting_path, stemmer,
                                      querie_term_dictionary, list_of_language,
                                      list_of_city)
    return res
Code Example #6
    def match(self, uni_query='', idx={}, full=0):
        if not idx: return []
        segmenter = Segmenter.get_seg(id=idx['seg'])
        prefix = idx['prefix']
        list_s = segmenter(uni_query)
        logger.error("================lists:[%s]\t[%s]\t[%s]" %
                     (uni_query, idx['seg'], "|".join(list_s)))
        list_bitset, list_docid, len_list_docid = [], [], 0
        for s in list_s:
            str_s = redis_zero.hget(prefix, s)
            if str_s:
                yhBitset = YhBitset.YhBitset()
                yhBitset.frombytes(str_s)
                list_bitset.append(yhBitset)
                logger.error('%s matched len %s' % (s, yhBitset.length()))
            else:
                logger.error('%s filtered' % s)
        bitset = YhBitset.YhBitset()
        bitset_join_len = 0
        if list_bitset:
            bitset = list_bitset[0]
            bitset_join_len += 1
            for bs in list_bitset[1:]:
                test = bitset.anditem(bs)
                if test.length() <= 0:
                    break
                bitset = test
                bitset_join_len += 1

        if full:
            logger.error('===============match full %s %s' %
                         (bitset_join_len, len(list_s)))
            if bitset_join_len < floor(0.8 * len(list_s)):
                logger.error('===============match not enough long %s %s' %
                             (bitset_join_len, len(list_s)))
                return []

        #bitset= bitset.oritem(bitset_right)
        list_docid = bitset.search(200, 1)
        list_docid = ['%s' % id for id in list_docid]
        if list_docid:
            list_docid = Ranker.Ranker().getRank(name='unigram_rank',
                                                 list_id=list_docid)
        logger.error('match [%s] [%s] [%s]' % (uni_query, list_s, list_docid))
        return list_docid[:200]
Code Example #7
def eval_batch(fts,
               captioner,
               retriever,
               args,
               train_mode=False,
               optimizer=None):
    criterion = nn.TripletMarginLoss(reduction='mean',
                                     margin=args.margin).to(device)
    # generate a mapping for dev, to ensure sampling bias is reduced
    num_target = len(fts['asins'])

    batch_size = args.batch_size
    ranker = Ranker.Ranker(device)
    total_step = math.floor(num_target / batch_size)

    ranking_tracker = [0] * args.num_dialog_turns
    loss_tracker = [0] * args.num_dialog_turns

    with open('data/shuffled.{}.{}.json'.format(args.data_set, 'val')) as f:
        first_candidate_set = json.load(f)

    with torch.no_grad():
        retriever.eval()
        ranker.update_emb(fts, args.batch_size, retriever)

    if train_mode:
        retriever.train()
    else:
        retriever.eval()

    for step in tqdm.tqdm(range(total_step)):
        # sample target
        if train_mode:
            target_ids = torch.tensor([0] * args.batch_size,
                                      device=device,
                                      dtype=torch.long)
            target_ids.random_(0, num_target)
        else:
            target_ids = torch.tensor([
                i for i in range(step * batch_size, (step + 1) * batch_size)
            ]).to(device=device, dtype=torch.long)

        # sample first batch of candidates
        if train_mode:
            candidate_ids = torch.tensor([0] * args.batch_size,
                                         device=device,
                                         dtype=torch.long)
            candidate_ids.random_(0, num_target)
        else:
            candidate_ids = torch.tensor([
                first_candidate_set[i]
                for i in range(step * batch_size, (step + 1) * batch_size)
            ],
                                         device=device,
                                         dtype=torch.long)

        # target_ids.random_(0, num_target)
        target_img_ft = utils.get_image_batch(fts, target_ids)
        target_img_ft = target_img_ft.to(device)
        target_img_emb = retriever.encode_image(target_img_ft)

        target_attr = utils.get_attribute_batch(fts, target_ids)
        target_attr = target_attr.to(device)

        # clean up dialog history tracker
        retriever.init_hist()
        # history_hidden = history_hidden.expand_as(target_img_emb)
        # history_hidden = None
        loss = 0

        for d_turn in range(args.num_dialog_turns):
            # get candidate image features
            candidate_img_ft = utils.get_image_batch(fts, candidate_ids)
            candidate_img_ft = candidate_img_ft.to(device)

            candidate_attr = utils.get_attribute_batch(fts, candidate_ids)
            candidate_attr = candidate_attr.to(device)
            # generate captions from model
            with torch.no_grad():
                sentence_ids = captioner.get_caption(target_img_ft,
                                                     candidate_img_ft,
                                                     target_attr,
                                                     candidate_attr)
            sentence_ids = sentence_ids.to(device)

            candidate_img_ft = candidate_img_ft.to(device)

            history_hidden = retriever.forward(text=sentence_ids,
                                               image=candidate_img_ft,
                                               attribute=candidate_attr)

            # sample negatives, update tracker's output to
            # match targets via triplet loss
            negative_ids = torch.tensor([0] * args.batch_size,
                                        device=device,
                                        dtype=torch.long)
            negative_ids.random_(0, num_target)

            negative_img_ft = utils.get_image_batch(fts, negative_ids)
            negative_img_ft = negative_img_ft.to(device)
            negative_img_emb = retriever.encode_image(negative_img_ft)

            # accumulate loss
            loss_tmp = criterion(history_hidden, target_img_emb,
                                 negative_img_emb)
            loss += loss_tmp
            loss_tracker[d_turn] += loss_tmp.item()

            # generate new candidates, compute ranking information
            with torch.no_grad():
                candidate_ids = ranker.nearest_neighbors(history_hidden)
                ranking = ranker.compute_rank(history_hidden, target_ids)
            ranking_tracker[d_turn] += (ranking.mean().item() /
                                        (num_target * 1.0))

        # update weights
        if train_mode:
            optimizer.zero_grad()
            # loss = loss / args.num_dialog_turns
            loss.backward()
            # clip_grad_norm_(retriever.parameters(), args.clip_norm)
            optimizer.step()
            with torch.no_grad():
                retriever.eval()
                ranker.update_emb(fts, args.batch_size, retriever)
                retriever.train()

    loss = loss.item() / total_step
    for i in range(args.num_dialog_turns):
        ranking_tracker[i] /= total_step
        loss_tracker[i] /= total_step

    metrics = {
        'loss': loss,
        'score': 5 - sum(ranking_tracker),
        'loss_tracker': loss_tracker,
        'ranking_tracker': ranking_tracker
    }
    return metrics
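
The Ranker.Ranker(device) used here and in Code Example #10 is unrelated to the search-engine rankers above: it holds an embedding matrix for the whole catalog, refreshed via update_emb, and answers nearest-neighbor and rank queries against it. The class itself is not listed in this section; the sketch below is a plausible reconstruction of just those three methods, assuming fts['image'] is an (N, ft_dim) feature tensor:

import torch

class Ranker:
    # Hypothetical sketch; only the method names and call sites come
    # from the examples above.
    def __init__(self, device):
        self.device = device
        self.emb = None  # (num_items, emb_dim) catalog embeddings

    def update_emb(self, fts, batch_size, retriever):
        # Re-encode the whole catalog with the current retriever weights.
        feats = fts['image'].to(self.device)
        self.emb = torch.cat([
            retriever.encode_image(feats[i:i + batch_size])
            for i in range(0, feats.size(0), batch_size)
        ], dim=0)

    def nearest_neighbors(self, query):
        # Index of the closest catalog item for each query embedding.
        return torch.cdist(query, self.emb).argmin(dim=1)

    def compute_rank(self, query, target_ids):
        # Number of items strictly closer to the query than the target is.
        dists = torch.cdist(query, self.emb)                 # (batch, num_items)
        target_d = dists.gather(1, target_ids.unsqueeze(1))  # (batch, 1)
        return (dists < target_d).sum(dim=1).float()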
Code Example #8
File: Question.py  Project: nwyt/Question-Answering
 def test(self, passages):
     keywords = getKeyWords(self.desc, STOP_WORDS)
     return Ranker.passage_rankings(self.desc, passages, keywords)
Code Example #9
File: Question.py  Project: nwyt/Question-Answering
 def top5(self, passages):
     keywords = getKeyWords(self.desc, STOP_WORDS)
     return list(zip(*Ranker.rank_passages(self.desc, passages, keywords)[:5]))[1]
Code Example #10
def eval_batch(fts, captioner, retriever, args):
    criterion = nn.TripletMarginLoss(reduction='mean',
                                     margin=args.margin).to(device)
    # generate a mapping for dev, to ensure sampling bias is reduced
    num_target = len(fts['asins'])

    batch_size = args.batch_size
    ranker = Ranker.Ranker(device)
    total_step = math.floor(num_target / batch_size)

    ranking_tracker = [0] * args.num_dialog_turns
    loss_tracker = [0] * args.num_dialog_turns

    with open('data/shuffled.{}.{}.json'.format(args.data_set, 'test')) as f:
        first_candidate_set = json.load(f)

    with torch.no_grad():
        retriever.eval()
        ranker.update_emb(fts, args.batch_size, retriever)

    retriever.eval()
    ret_results = {}
    total_time = 0

    for step in tqdm.tqdm(range(total_step)):
        # sample target
        target_ids = torch.tensor([
            i for i in range(step * batch_size, (step + 1) * batch_size)
        ]).to(device=device, dtype=torch.long)

        # sample first batch of candidates
        candidate_ids = torch.tensor([
            first_candidate_set[i]
            for i in range(step * batch_size, (step + 1) * batch_size)
        ],
                                     device=device,
                                     dtype=torch.long)

        # keep track of results
        ret_result = {}
        for batch_id in range(target_ids.size(0)):
            idx = target_ids[batch_id].cpu().item()
            ret_result[idx] = {}
            ret_result[idx]['candidate'] = []
            ret_result[idx]['ranking'] = []
            ret_result[idx]['caption'] = []

        target_img_ft = utils.get_image_batch(fts, target_ids)
        target_img_ft = target_img_ft.to(device)
        target_img_emb = retriever.encode_image(target_img_ft)

        target_attr = utils.get_attribute_batch(fts, target_ids)
        target_attr = target_attr.to(device)

        # clean up dialog history tracker
        retriever.init_hist()
        # history_hidden = history_hidden.expand_as(target_img_emb)

        loss = 0

        for d_turn in range(args.num_dialog_turns):
            last_timer = int(round(time.time() * 1000))
            # get candidate image features
            candidate_img_ft = utils.get_image_batch(fts, candidate_ids)
            candidate_img_ft = candidate_img_ft.to(device)

            candidate_attr = utils.get_attribute_batch(fts, candidate_ids)
            candidate_attr = candidate_attr.to(device)
            # generate captions from model
            total_time += (int(round(time.time() * 1000)) - last_timer)
            with torch.no_grad():
                sentence_ids, caps = captioner.get_caption(target_img_ft,
                                                           candidate_img_ft,
                                                           target_attr,
                                                           candidate_attr,
                                                           return_cap=True)
            last_timer = int(round(time.time() * 1000))
            sentence_ids = sentence_ids.to(device)

            candidate_img_ft = candidate_img_ft.to(device)

            history_hidden = retriever.forward(text=sentence_ids,
                                               image=candidate_img_ft,
                                               attribute=candidate_attr)

            # sample negatives, update tracker's output to
            # match targets via triplet loss
            negative_ids = torch.tensor([0] * args.batch_size,
                                        device=device,
                                        dtype=torch.long)
            negative_ids.random_(0, num_target)

            negative_img_ft = utils.get_image_batch(fts, negative_ids)
            negative_img_ft = negative_img_ft.to(device)
            negative_img_emb = retriever.encode_image(negative_img_ft)

            # accumulate loss
            loss_tmp = criterion(history_hidden, target_img_emb,
                                 negative_img_emb)
            loss += loss_tmp
            loss_tracker[d_turn] += loss_tmp.item()

            # generate new candidates, compute ranking information
            with torch.no_grad():
                candidate_ids = ranker.nearest_neighbors(history_hidden)
                ranking = ranker.compute_rank(history_hidden, target_ids)
            ranking_tracker[d_turn] += (ranking.mean().item() /
                                        (num_target * 1.0))

            for batch_id in range(target_ids.size(0)):
                idx = target_ids[batch_id].cpu().item()
                ret_result[idx]['caption'].append(caps[batch_id])
                ret_result[idx]['candidate'].append(
                    candidate_ids[batch_id].item())
                ret_result[idx]['ranking'].append(ranking[batch_id].item())

            total_time += (int(round(time.time() * 1000)) - last_timer)

        ret_results.update(ret_result)

    loss = loss.item() / total_step
    for i in range(args.num_dialog_turns):
        ranking_tracker[i] /= total_step
        loss_tracker[i] /= total_step

    metrics = {
        'loss': loss,
        'score': 5 - sum(ranking_tracker),
        'loss_tracker': loss_tracker,
        'ranking_tracker': ranking_tracker,
        'retrieve_time': total_time / float(num_target)
    }
    return metrics, ret_results
Code Example #11
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    start = []
    end = []
    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:", ["collectionFile=", "tokenizerType=", "queriesFilePath=",
                                                     "rankType="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    if len(opts) != 4:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg

    # Indexer
    (Indexer(collectionFile, tokenizerType)).writeIndexToFile('index')

    with open(queriesFile, 'r') as f:
        queries = f.read().splitlines()

    scores = []

    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    for query in queries:

        # Query Operations
        tokenizer.changeText(query)
        queryTerms = tokenizer.getTerms()

        
        # Searcher
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index')

        # Ranker
        ranker = Ranker(documentsInfo, avgDocLen)
        
        # Start time (latency purpose)
        start.append(timer())
        # If rankType = 0 (tf-idf)
        if rankType == '0':
            scores += [ranker.lnc_ltc()]
        # If rankType = 1 (BM25)
        else:
            scores += [ranker.bm25(1.2, 0.75)]

        # End time (latency purpose)
        end.append(timer())

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start, end)
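
The ranker.bm25(1.2, 0.75) call passes the usual Okapi defaults k1 = 1.2 and b = 0.75 (Code Example #1 exposes the same knobs as bm25_k1 and bm25_b). The Ranker class is not listed here; for reference, the standard per-term BM25 weight it presumably computes looks like this:

import math

def bm25_weight(tf, df, doc_len, avg_doc_len, num_docs, k1=1.2, b=0.75):
    # idf dampens common terms; the tf factor saturates with k1 and is
    # normalised by relative document length via b.
    idf = math.log((num_docs - df + 0.5) / (df + 0.5) + 1)
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))

# A term occurring 3 times in a slightly short document (numbers invented):
print(bm25_weight(tf=3, df=12, doc_len=90, avg_doc_len=120, num_docs=10000))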
Code Example #12
def main(argv):

    # ----------------------------------------- HANDLING PROGRAM INPUT -------------------------------------------------
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    storePos = ''
    proximity = ''
    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:p:b:", ["collectionFile=", "tokenizerType=", "queriesFilePath=",
                                                     "rankType=", "storePositions=", "proximityBoost="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    if len(opts) != 6:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg
        elif opt in ("-p", "--storePositions"):
            if arg != '0' and arg != '1':
                print('\nIncorrect store positions choice. No: 0, Yes: 1.')
                sys.exit()
            storePos = arg
        elif opt in ("-b", "--proximityBoost"):
            if arg != '0' and arg != '1':
                print('\nIncorrect proximity boost choice. No: 0, Yes: 1.')
                sys.exit()
            proximity = arg

    # ----------------------------------------------- INDEXER ----------------------------------------------------------
    indexer = Indexer(collectionFile, tokenizerType, storePos == '1')

    start = timeit.default_timer()
    indexer.index()
    stop = timeit.default_timer()

    print('Indexing total time - {} tokenizer: {} min and {} seconds'.format("simple" if tokenizerType == "0" else "better", (stop - start)//60, (stop - start) % 60))

    with open(queriesFile, 'r') as f:
        queries = f.read().splitlines()

    scores = []

    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    start_queries = []
    end_queries = []
    time_searcher = 0
    time_ranker = 0
    for query in queries:

        # --------------------------------------- QUERY OPERATIONS -----------------------------------------------------
        tokenizer.changeText(query)

        #queryTerms, queryTermsPositions = tokenizer.getTerms(withPositions=True if storePos == '1' else False)
        queryTerms = tokenizer.getTerms(withPositions=False)

        # ------------------------------------------- SEARCHER ---------------------------------------------------------
        start = timeit.default_timer()
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index', storePos == '1')
        stop = timeit.default_timer()
        time_searcher = time_searcher + stop - start

        # -------------------------------------------- RANKER ----------------------------------------------------------
        start = timeit.default_timer()
        ranker = Ranker(documentsInfo, avgDocLen)
        
        # Start time (latency purpose)
        start_queries.append(timer())
        # If rankType = 0 (tf-idf)
        if rankType == '0':
            # If proximity = 1 (Proximity Boost)
            if proximity == '1':
                scores += [ranker.proximity_boost(ranker.lnc_ltc(), queryTerms)]
            else:
                scores += [ranker.lnc_ltc()]
        # If rankType = 1 (BM25)
        else:
            # If proximity = 1 (Proximity Boost)
            if proximity == '1':
                scores += [ranker.proximity_boost(ranker.bm25(1.2, 0.75), queryTerms)]
            else:
                scores += [ranker.bm25(1.2, 0.75)]

        stop = timeit.default_timer()
        time_ranker = time_ranker + stop - start

        # End time (latency purpose)
        end_queries.append(timer())


    print('Searching time for all queries: {} min and {} seconds'.format(time_searcher // 60, time_searcher % 60))
    print('Ranking time for all queries: {} min and {} seconds'.format(time_ranker // 60, time_ranker % 60))

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start_queries, end_queries)
Code Example #13
    def write_out(self,
                  epoch,
                  names_test,
                  num_img_test,
                  inp_feat_test,
                  rank_algo,
                  out_file='results_nn'):

        start_idx = -1
        end_idx = 0

        pred_file = open(out_file, 'r+')
        pred_file.seek(0)

        for i in range(len(num_img_test)):

            print "Predicting for video: ", i + 78
            vid_name = "video_" + str(i + 78)
            start_idx = end_idx
            end_idx = start_idx + num_img_test[i]
            interest_vec_sp = 0.0 * np.array(range(num_img_test[i]))
            interest_vec_pp = 0.0 * np.array(range(num_img_test[i]))
            binary_int = 0 * np.array(range(num_img_test[i]))
            num_int = int(0.12 * num_img_test[i]) + 1
            cur_PPM = np.zeros([num_img_test[i], num_img_test[i]])

            for j in range(start_idx, end_idx, 1):
                for k in range(j + 1, end_idx, 1):
                    pred = self.model.predict(
                        np.append(inp_feat_test[j],
                                  inp_feat_test[k]).reshape(1, -1))[0][0]
                    pred_int = int(np.round(pred))
                    cur_PPM[j - start_idx, k - start_idx] = pred_int

                    interest_vec_pp[pred_int * (k - start_idx) +
                                    (1 - pred_int) *
                                    (j - start_idx)] += abs(pred - 0.5)
                    interest_vec_sp[
                        pred_int * (k - start_idx) + (1 - pred_int) *
                        (j - start_idx
                         )] += 1  #Less interesting image gets 1 added in score

            if (rank_algo == 'sp'):
                interest_vec = interest_vec_sp
                temp = interest_vec.argsort()
                ranks = np.arange(len(interest_vec))[temp.argsort()]
                ranks = ranks + 1

            elif (rank_algo == 'pp'):
                interest_vec = interest_vec_pp
                temp = interest_vec.argsort()
                ranks = np.arange(len(interest_vec))[temp.argsort()]
                ranks = ranks + 1

            elif (rank_algo == 'mih_to'):
                ranks = Ranker.mih_to(cur_PPM)

            elif (rank_algo == 'mih_ro'):
                ranks = Ranker.mih_ro(cur_PPM)

            binary_int = (ranks <= num_int).astype(int)

            print "Predictions made for video: ", i + 78
            for j in range(num_img_test[i]):
                str_line = vid_name + ',' + str(names_test[i][j]) + ',' + str(
                    binary_int[j]) + ',' + str(1.0 / ranks[j])
                pred_file.write(str_line + '\n')
        pred_file.close()
        return
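
The 'sp' and 'pp' branches above turn per-image scores into ranks with the double-argsort idiom: argsort of the scores gives the ordering, and argsort of that ordering gives each item's position in it. A tiny standalone demonstration (scores invented):

import numpy as np

scores = np.array([0.2, 1.7, 0.5, 3.0])  # "less interesting" tallies
order = scores.argsort()                 # indices sorted by score, ascending
ranks = np.arange(len(scores))[order.argsort()] + 1
print(ranks)  # [1 3 2 4] -- rank 1 goes to the lowest (most interesting) score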
Code Example #14
File: Master Scheduler.py  Project: mellamok/Soapbox
def cycle_1_min():
    #Pass the timestamp to the various programs
    now = datetime.datetime.now()

    #File components
    iteration = now
    iteration_str = "{year} {month} {day} {hour} {minute}".format(
        year=now.year,
        month=now.month,
        day=now.day,
        hour=now.hour,
        minute=now.minute)
    iteration_file_puller = "Puller {year} {month} {day} {hour} {minute}.sqlite".format(
        year=now.year,
        month=now.month,
        day=now.day,
        hour=now.hour,
        minute=now.minute)

    #Do Puller
    pullerids = [
        "%23soapbox -filter:retweets -filter:media",
        "@internetsoapbox -filter:retweets -filter:media",
    ]

    print("Pulling file = ", iteration_file_puller)
    startpull = Puller.Scrape(iteration_file_puller, pullerids)
    startpull.main()
    print("Pulling complete!")
    pullerfinish = datetime.datetime.now() - now
    print("Time to Pull: ", pullerfinish)

    #Do Alltime Ranker
    print("Ranking alltime...")
    alltime = Ranker.Ranker(iteration, iteration_str, "alltime")
    alltime.main()
    print("Ranking alltime complete!")

    #Do Daily Ranker
    print("Ranking daily...")
    daily = Ranker.Ranker(iteration, iteration_str, "daily")
    daily.main()
    print("Ranking daily complete!")

    #Do Hourly Ranker
    print("Ranking hourly...")
    hourly = Ranker.Ranker(iteration, iteration_str, "hourly")
    hourly.main()
    print("Ranking hourly complete!")

    #Program Time
    finishtime = datetime.datetime.now() - now
    print("Time to Complete: ", finishtime)

    #return startvar and stable iterations
    global stable_iteration
    stable_iteration = iteration
    global stable_iteration_str
    stable_iteration_str = iteration_str

    global regen_popcorn_flag
    regen_popcorn_flag = 1

    global first_start_var
    first_start_var = 1