Example #1
  def __init__(self, files_by_basename):

    files_by_lower_basename = dict()
    for basename,files_with_basename in files_by_basename.items():
      lower_basename = basename.lower()
      if lower_basename in files_by_lower_basename:
        files_by_lower_basename[lower_basename].extend(files_with_basename)
      else:
        files_by_lower_basename[lower_basename] = files_with_basename

    self.basenames_unsplit = ("\n" + "\n".join(files_by_basename.keys()) + "\n").encode('utf8')
    self.lower_basenames_unsplit = ("\n" + "\n".join(files_by_lower_basename.keys()) + "\n").encode('utf8')
    assert isinstance(self.lower_basenames_unsplit, bytes)  # encode() above yields bytes (str on Python 2)

    ranker = Ranker()
    wordstarts = {}
    for basename,files_with_basename in files_by_basename.items():
      start_letters = ranker.get_start_letters(basename)
      if len(start_letters) <= 1:
        continue
      lower_basename = basename.lower()
      for i in range(len(start_letters) + 1 - 2): # abcd -> ab abc abcd
        ws = ''.join(start_letters[0:2+i])
        if ws not in wordstarts:
          wordstarts[ws] = []
        loss = len(start_letters) - (2 + i)
        wordstarts[ws].append((lower_basename, loss))

    # now, order the actual entries so the highest-quality (lowest-loss) matches are at the front
    self.basenames_by_wordstarts = {}
    for ws, items in wordstarts.items():
      items.sort(key=lambda x: x[1])
      self.basenames_by_wordstarts[ws] = [i[0] for i in items]
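To make the index structure concrete, here is a tiny standalone sketch of the same loop for one hypothetical basename (values are illustrative, not taken from a real index):

# Hypothetical worked example of the wordstart index.
start_letters = ['r', 'w', 'h']              # e.g. from 'render_widget_host.cc'
lower_basename = 'render_widget_host.cc'
wordstarts = {}
for i in range(len(start_letters) + 1 - 2):  # prefixes 'rw' and 'rwh'
    ws = ''.join(start_letters[0:2 + i])
    loss = len(start_letters) - (2 + i)      # start letters the prefix leaves uncovered
    wordstarts.setdefault(ws, []).append((lower_basename, loss))
print(wordstarts)  # {'rw': [('render_widget_host.cc', 1)], 'rwh': [('render_widget_host.cc', 0)]}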
Example #2
    def __init__(self):
        self.du = DU()
        self.vocab, self.recab = self.du.initialize_vocabulary()
        self.ids_arr = []
        for line in open(self.du.ids_path):
            line = line.strip()
            if len(line) > 0:
                temp = line.split(' ')
                for i in range(len(temp)):
                    temp[i] = int(temp[i])
                self.ids_arr.append(temp)
            else:
                self.ids_arr.append([])

        self.mark = json.load(open(self.du.mark_path))
        self.train = json.load(open(self.du.train_path))
        self.dev = json.load(open(self.du.dev_path))
        self.test = json.load(open(self.du.test_path))

        self.model = Ranker(
            vocab_size=FLAGS.vocab_size,
            embedding_size=FLAGS.emd_size,
            memory_size=FLAGS.mem_size,
            batch_size=FLAGS.batch_size,
            max_dialogue_size=FLAGS.max_dialogue_size,
            max_sentence_size=FLAGS.max_sentence_size,
            margin=FLAGS.margin,
            max_gradient_norm=FLAGS.max_gradient_norm,
            learning_rate=FLAGS.learning_rate,
            learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
            use_lstm=False,
            train_mode=FLAGS.train,
            # drop_out=FLAGS.drop_out,
            # layer=FLAGS.layer,
        )
Example #3
 def __init__(self, parser, indexer, model=None, model_1=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self._model_1 = model_1
     self.spellcheck = Spell_check()
Example #4
 def __init__(self, parser, indexer, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self.terms_searched = {}
     self.total_num_of_docs = parser.curr_idx
Example #5
  def add_all_matching(self, hits, query, flt_tuple, max_hits):
    """
    hits is the dictionary to put results in
    query is the query string originally entered by the user, used for ranking
    flt_tuple is (filter_regex, case_sensitive_bool)
    max_hits is the largest size hits may grow to before matching terminates.
    """
    flt, case_sensitive = flt_tuple

    regex = re.compile(flt)
    base = 0
    ranker = Ranker()
    if not case_sensitive:
      index = self.lower_basenames_unsplit
    else:
      index = self.basenames_unsplit
    while True:
      m = regex.search(index, base)
      if m:
        hit = m.group(0)[1:-1]
        if hit.find('\n') != -1:
          raise Exception("Something is messed up with flt=[%s] query=[%s] hit=[%s]" % (flt, query, hit))
        rank = ranker.rank(query, hit)
        if case_sensitive:
          hit = hit.lower()
        if hit in hits:
          hits[hit] = max(hits[hit],rank)
        else:
          hits[hit] = rank
        base = m.end() - 1
        if len(hits) >= max_hits:
          truncated = True
          break
      else:
        break
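For orientation, a hedged sketch of how add_all_matching might be called; the object name, query, and filter regex below are illustrative rather than taken from the original project:

# Hypothetical call: collect case-insensitive fuzzy matches for the query 'rwhv'.
# The filter regex is matched against the newline-delimited basename index, so it
# includes the surrounding '\n' characters that the method strips off each hit.
hits = {}  # basename -> best rank seen so far
db_index.add_all_matching(hits, 'rwhv',
                          ('\nr[^\n]*w[^\n]*h[^\n]*v[^\n]*\n', False),
                          max_hits=100)
best_first = sorted(hits.items(), key=lambda kv: kv[1], reverse=True)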
Example #6
    def __init__(self):
        super().__init__()

        self.model_lm = LanguageModel()
        self.model_ct = ContentTransfer()
        self.kb = KnowledgeBase()
        self.ranker = Ranker(self.model_lm)
        self.local = True
Example #7
 def add_all_wordstarts_matching(self, hits, query, max_hits):
     lower_query = query.lower()
     if lower_query in self.basenames_by_wordstarts:
         ranker = Ranker()
         for basename in self.basenames_by_wordstarts[lower_query]:
             rank = ranker.rank(query, basename)
             hits[basename] = rank
             if len(hits) >= max_hits:
                 return
Example #8
 def add_all_wordstarts_matching( self, hits, query, max_hits ):
   lower_query = query.lower()
   if lower_query in self.basenames_by_wordstarts:
     ranker = Ranker()
     for basename in self.basenames_by_wordstarts[lower_query]:
       rank = ranker.rank(query, basename)
       hits[basename] = rank
       if len(hits) >= max_hits:
         return
Example #9
 def __init__(self, parser, indexer, config, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker(config)
     self._model = model
     self._the_count = config.the_count
     self._wordnet_count = config.wordnet_count
     self._min_relevant = config.min_relevant
     self._ext_val = config.ext_val
Example #10
class Recommender:

    def __init__(self):
        self.user = User()
        self.ranker = Ranker()
	
    def build_states(self, states = None):
        """
            Add the states to the ranker in the proper format
        """
        if states is not None:
            self.ranker.states = states
        else:
            import os
            path = os.path.abspath(os.path.dirname(__file__))
            json_data = open(os.path.join(path,'crawler/results/state.json'))
            data = json.load(json_data)
            json_data.close()
            
            state_data = data[1]
            for state in state_data:
                state_vect = []
                for i in range(1, len(state_data[state])):
                    state_vect.append(state_data[state][i] / float(state_data[state][0]))
                state_data[state] = state_vect
            self.ranker.states = state_data

    def build_counties(self, counties = None):
        """
            Add the counties to the ranker in the proper format
        """
        if counties is not None:
            self.ranker.counties = counties
        else:
            import os
            path = os.path.abspath(os.path.dirname(__file__))
            json_data = open(os.path.join(path,'crawler/results/county.json'))
            data = json.load(json_data)
            json_data.close()
            
            county_data = data[1]
            for county in county_data:
                county_vect = []
                for i in range(1, len(county_data[county])):
                    county_vect.append(county_data[county][i] / float(county_data[county][0]))
                county_data[county] = county_vect
            self.ranker.counties = county_data

    def set_user(self, user):
        self.user = user

    def recommend_states(self):
        return self.ranker.rank_states(self.user.get_vector())

    def recommend_counties(self):
        return self.ranker.rank_counties(self.user.get_vector())
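As a concrete illustration of the normalization in build_states and build_counties above (the field layout is an assumption: the first element is treated as the denominator, e.g. a population-like total):

# Hypothetical raw record: [total, metric_a, metric_b]
raw = [1000.0, 250.0, 50.0]
state_vect = [raw[i] / float(raw[0]) for i in range(1, len(raw))]
print(state_vect)  # [0.25, 0.05]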
Example #11
def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except:
        print("Couldn't save submit_puke")

    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)
    base = 0
    entire_articles = []
    not_heavy_items = set(range(1, article_count+1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50

    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base+group_size]
            scores = rank_scores[base:base+group_size]

            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)
            entire_articles.extend(articles)

            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)

            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))

            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
Example #12
    def __init__(self):
        super().__init__()

        self.model_mrc = BidafQA()
        self.model_cmr = ConversingByReading()
        self.model_open = DialoGPT()
        self.kb = KnowledgeBase()
        model_mmi = DialoGPT(path_model='models/DialoGPT/small_reverse.pkl')
        self.ranker = Ranker(self.model_open, model_mmi)
        self.local = True
Example #13
 def __init__(self, filename):
     x = 5000
     sys.setrecursionlimit(x)
     #names, cnf = IO.read_dimacs('SPLOT-3CNF-FM-500-50-1.00-SAT-10')
     self.items = SATSolver.get_solutions(10000, filename)
     self.weights = [1] * len(self.items)
     self.tree = sway(self.items, 100)
     self.names = [] #names
     # Weight of top node = 0
     # self.tree.weight = 0
     self.rank = Ranker.level_rank_features(self.tree, self.weights)
     self.cur_best_node = Ranker.rank_nodes(self.tree, self.rank)
     self.questions = IO.get_question_text('terms_sentence_map.csv', 'sentence')
Example #14
    def __init__(self, parser, indexer, model=None):
        self.config = indexer.config
        self._parser = parser
        self._indexer = indexer
        self.number_of_docs = indexer.num_of_docs
        self._model = model
        # self.inverted_index, self.document_dict = self._indexer.load_index("idx_engine1.pkl")
        self.inverted_index, self.document_dict = self._indexer.inverted_idx, self._indexer.document_dict

        self.glove_dict = self._indexer.glove_dict
        use_glove = len(self.glove_dict) > 0
        self.ranker = Ranker(self.config, use_glove)
Example #15
 def check_solution(self):
     if sum(self.rank) == 0:
         return Search.get_all_items(self.tree)
     value = Ranker.check_solution(self.tree)
     if value is None:
         return None
     return Search.get_all_items(self.tree)
Example #16
 def __init__(self, parser, output_path, stem):
     """
     :param parser: parser used to process documents and queries
     :param output_path: directory where posting/index files are written
     :param stem: whether stemming is applied when building the index
     """
     self.parser = parser
     self.ranker = Ranker(output_path, stem)
     self.path = output_path
     self.counter = 1
     self.stem = stem
     self.lda_model = None
     self.dictionary = None
     self.dict = None
     self.documents = None
     self.docslen = 0
     self.documentfilenames = {
         'zero_documents': 0,
         'first_documents': 0,
         'second_documents': 0,
         'third_documents': 0,
         'fourth_documents': 0,
         'fifth_documents': 0,
         'sixth_documents': 0,
         'seventh_documents': 0,
         'eighth_documents': 0,
         'ninth_documents': 0
     }
Example #17
def scheduled_job():
    """
        This job runs every Monday at 12:00.
    """
    now = datetime.datetime.now()
    podcasts = Ranker('internet-tecnologia',445,5).build()
    Storage.save('storage/ranking_{0}-{1}-{2}.json'.format(now.year,now.strftime('%m'),now.strftime('%d')), podcasts)
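The docstring says the job runs every Monday at 12:00, but the snippet does not show how it is registered; a minimal sketch of one way to wire it up, assuming APScheduler (the scheduler choice is an assumption, not part of the original example):

from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()
# Fire scheduled_job every Monday at 12:00, matching the docstring.
sched.add_job(scheduled_job, 'cron', day_of_week='mon', hour=12)
sched.start()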
Example #18
 def __init__(self, inverted_index, posting_file=None):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.posting_file = posting_file
Example #19
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, defaults to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)

        # thesaurus
        for word in query_as_dict.copy().keys():
            if len(thes.synonyms(word)[1][1]):
                syn = list(thes.synonyms(word)[1][1])[:30]
                for s in syn:
                    if s not in query_as_dict and s in self._indexer.inverted_idx:
                        query_as_dict[s] = 1
                        break

        relevant_docs = self._relevant_docs_from_posting(query_as_dict)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        # print("SE4 top5:")
        # print(ranked_doc_ids[:5])
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
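A brief usage sketch for the search method above, assuming a search-engine instance has already been built and its index loaded (the engine variable is illustrative):

# Hypothetical call: search returns (number of relevant docs, ranked tweet ids).
n_relevant, ranked_ids = engine.search("covid vaccine trial results")
top5 = ranked_ids[:5]  # most relevant tweets first
print(n_relevant, top5)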
Example #20
    def __init__(self, seed_urls=None, save_html=1, use_splash=1, screenshot_dir='/memex-pinterest/ui/static/images/screenshots', op_time=10, **kwargs):
        '''
        Constructs a spider instance from the command line or the scrapyd daemon.

        :param seed_urls: Comma-separated list of URLs, if empty crawler will be following not crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes; a negative value disables the constraint
        :param kwargs:
        :return:
        '''
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)

        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60

        self.start_time = datetime.utcnow()
        self.finishing = False
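For reference, Scrapy normally constructs this spider itself and passes these values as -a command-line arguments; a hedged sketch of the equivalent direct construction, with every value below purely illustrative:

# Hypothetical construction; in practice Scrapy/scrapyd builds the spider from
# command-line arguments (e.g. scrapy crawl <spider> -a seed_urls=...).
spider = TopicalFinder(
    seed_urls='http://example.com,http://example.org',  # comma-separated; missing schemes are added
    save_html=1,   # store fetched HTML
    use_splash=1,  # render via Splash and save screenshots under screenshot_dir
    op_time=30,    # stop following links after roughly 30 minutes
)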
Example #21
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, defaults to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)

        # wordnet
        for word in query_as_dict.copy().keys():
            syn = []
            # if word not in self._indexer.inverted_idx:
            for synset in wordnet.synsets(word):
                for lemma in synset.lemmas():
                    syn.append(lemma.name().replace('_',
                                                    ' '))  # add the synonyms
            for s in syn:
                if s not in query_as_dict and s in self._indexer.inverted_idx:
                    query_as_dict[s] = 1
                    break

        relevant_docs = self._relevant_docs_from_posting(query_as_dict)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        n_relevant = len(ranked_doc_ids)
        # print("SE1 top5:")
        # print(ranked_doc_ids[:5])
        return n_relevant, ranked_doc_ids
Example #22
 def __init__(self, config=None):
     self._config = config
     # self._parser = Parse()
     self._parser = Parse(self._config)
     self._indexer = Indexer(self._config)
     self._ranker = Ranker()
     self._model = None
Example #23
def main():
    experiment_set = final_experiment
    print("There are {} experiments to run".format(len(experiment_set)))
    train_data_path = "data/training.dat"
    dev_data_path = "data/full/dev.dat"
    tst_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    num_feats = len([line for line in open(feats_path)])
    batch_size = 80
    runs_per_experiment = 5

    for experiment_name in experiment_set.keys():
        logger.info("Running experiment {}".format(experiment_name))
        exp_features = experiment_set[experiment_name]
        out_path = 'output/experiments_v3/{}'.format(experiment_name)
        makedirs(out_path, exist_ok=True)
        train_instances = load_data(train_data_path, num_feats, exp_features)
        dev_instances = load_data(dev_data_path, num_feats, exp_features)
        dev_eval_instances = load_eval_data(dev_data_path, num_feats,
                                            exp_features)
        tst_instances = load_eval_data(tst_data_path, num_feats, exp_features)
        logger.info("Loaded {} training instances with {} features".format(
            len(train_instances), num_feats))
        for i in range(runs_per_experiment):
            iter_path = out_path + '/v{}'.format(i)
            makedirs(iter_path, exist_ok=True)
            ranker = Ranker(num_feats, 256)
            trainer = RankerTrainer(ranker, batch_size, iter_path)
            trainer.train(train_instances, dev_instances, None,
                          dev_eval_instances, tst_instances)
Example #24
def train_and_score_mongo():
    """ Rescore all items from mongo """
    
    print("**************Training*********************")
    train_on_user_input()


    print("**************Scoring and Indexing*****************")
    mmu = MemexMongoUtils()
    docs = mmu.list_all_urls_iterator(return_html = True)

    ranker = Ranker.load()
    for doc in tqdm(docs, leave = True):
        try:
            score = ranker.score_doc(doc)
        except Exception:
            score = 0

        mmu.set_score(doc["url"], score)

    _score_hosts()
Example #25
argprse.add_argument("-c", "--hsv", required = True,
	help = "File Path where the computed hsv index is saved")
argprse.add_argument("-t", "--texture", required = True,
	help = "File Path where the computed texture index is saved")
argprse.add_argument("-b", "--btree", required = True,
	help = "File Path where the computed tree index is saved")
argprse.add_argument("-q", "--query", required = True,
	help = "File Path to the query image")
argmnts = vars(argprse.parse_args())

# loading the query image and describing its color, texture and tree features
query_img = cv2.imread(argmnts["query"])
cfeats = cdes.describe_color(copy.copy(query_img))
texture = txdes.describe_texture(copy.copy(query_img))
tree = tdes.color_tree(copy.copy(query_img))
 
# ranking the images in our dataset based on the query image
ranker = Ranker(argmnts["hsv"], argmnts["texture"], argmnts["btree"])
final_results = ranker.rank(cfeats, texture, tree)

current_path = os.path.dirname(os.path.abspath(__file__))

# iterating over the final results
for (score, resID) in final_results:
	# printing the image names in the order of increasing score
	print(resID + "    " + str(score))
	source_path = argmnts["dataset"]+"/"+ resID
	dest_path = current_path+"/result/"+resID
	shutil.copy2(source_path,dest_path)

Example #26
        return rankings


if __name__=="__main__":
    '''
    sys.argv[1] => training data
    sys.argv[2] => data separator
    '''
    a = Predictor(sys.argv[1], sys.argv[2])
    users, items = a.store_data_relations() #~100MB
    #ratings, means = a.normalize_ratings(users)
    
    recommender = UserBasedPredictor(users) #first, without normalizing

    #recommender = UserBasedPredictor(ratings, means)

    #todo: check which items get recommended most and whether they are just the popular ones
    #todo: study how personalized these recommendations really are

    b = Ranker(5)
    statistically_better = 0.0
    for u in users.keys():
        #print u, b.topRatings(recommender.getRecommendations(u)[:30])
        a =  b.maximizeKGreatItems(1, recommender.getRecommendations(u)[:60], items)
        if a: statistically_better += 1.0
        print(statistically_better)
    print(statistically_better / len(users.keys()))

#TODO use euclidean distance
#TODO choose whichever approach gives the best RMSE
Example #27
        response = arrs_ranker.get_last(int(n))
    return response


@route('/arrivals/first/<n>')
def get_first(n):
    """
    Returns the first 'n' airports in arrivals ranking
    """
    if not arrs_ranker:
        status = 500
        response = "Ranker not initialized"    
    else:
        status = 200
        response = arrs_ranker.get_first(int(n))
    return response


if __name__ == "__main__":
    """
    """
    # start ranker
    print("starting ranker")
    arrs_ranker = Ranker()
    print("setting up ranking")
    arrs_ranker.setup_arrivals_ranking()
    print("starting server")
    # running server
    run(host='localhost', port=8080)

Example #28
        rankings = [(score, item) for item, score in scores.items()]
        rankings.sort(reverse=True)
        return rankings

if __name__=="__main__":
    '''
    sys.argv[1] => training data
    sys.argv[2] => test data
    sys.argv[3] => data separator
    '''
    training = Predictor(sys.argv[1], sys.argv[3])
    training_users, training_items = training.store_data_relations() #~100MB
    recommender = NNCossNgbrPredictor(training_items, training_users) 

    N = 10
    ranker = Ranker(N)
    testing = Predictor(sys.argv[2], sys.argv[3])
    test_users, test_items = testing.store_data_relations()
    ev = Evaluator(test_users, N)


    #TODO clean this interface!
    item_ids = list(set(training_items) | set(test_items))  # all unique items in the dataset
    hits = 0
    div_metric1 = []
    div_metric2 = []
    recommended_ratings = []
    for u in test_users.keys():
        for i in test_users[u].keys():

            user_items = []
Example #29
class RankerTest(unittest.TestCase):
  def setUp(self):
#    self.basenames = json.load(open('test_data/cr_files_basenames.json'))
    self.ranker = Ranker()

  def test_is_wordstart(self):
    def check(s, expectations):
      assert len(s) == len(expectations)
      for i in range(len(s)):
        self.assertEqual(expectations[i], self.ranker._is_wordstart(s, i), "disagreement on index %i" % i)

    check("foo", [True, False, False])
    check("fooBar", [True, False, False, True, False, False])
    check("o", [True])
    check("_", [True])
    check("F", [True])
    check("FooBar", [True, False, False, True, False, False])
    check("Foo_Bar", [True, False, False, False, True, False, False])
    check("_Bar", [True, True, False, False])
    check("_bar", [True, True, False, False])
    check("foo_bar", [True, False, False, False, True, False, False])

    check(".h", [True, False])
    check("a.h", [True, False, False])
    check("__b", [True, False, True])
    check("foo__bar", [True, False, False, False, False, True, False, False])

    check("Foo3D", [True, False, False, True, True])
    check("Foo33", [True, False, False, True, False])

    check("x3d", [True, True,  False]) # I could be convinced that 'd' is a wordstart.

    check("AAb", [True, True, False])
    check("CCFra", [True, True, True, False, False])

  def test_get_word_starts(self):
    data = {
      # This comment simply helps map indices to values
      # 1234567
      '' : [],
      'abc' : [0],
      'abd_def' : [0, 4],
      'ab_cd_ef' : [0, 3, 6],
      'ab_' : [0],
      'AA': [0, 1],
      'AAbA': [0,1,3],
      'Abc': [0],
      'AbcDef': [0,3],
      'Abc_Def': [0,4],
      }
    for word, expected_starts in data.items():
      starts = self.ranker.get_starts(word)
      self.assertEqual(expected_starts, starts, "for %s, expect %s" % (word, expected_starts))

  def assertBasicRankAndWordHitCountIs(self, expected_rank, expected_word_count, query, candidate):
    res = self.ranker._get_basic_rank(query, candidate)
    self.assertEqual(expected_rank, res[0])
    self.assertEqual(expected_word_count, res[1])

  def test_query_hits_on_word_starts(self):
    self.assertBasicRankAndWordHitCountIs(8, 4, 'rwhv', 'render_widget_host_view.cc') # test +1 for hitting all words
    self.assertBasicRankAndWordHitCountIs(6, 3, 'rwh', 'render_widget_host_view.cc')
    self.assertBasicRankAndWordHitCountIs(5.5, 2, 'wvi', 'render_widget_host_view_win.cc') # eew
    self.assertBasicRankAndWordHitCountIs(2, 1, 'w', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(2, 1, 'v', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(5, 2, 'evi', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'eWbViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(6, 0, 'ebewp', 'WebViewImpl.cc')


  def test_basic_rank_pays_attention_to_case(self):
    # these tests check that we don't miss case transitions
    self.assertBasicRankAndWordHitCountIs(4.5, 1, "rw", "rwf")
    self.assertBasicRankAndWordHitCountIs(4, 2, "rw", "rWf")

  def test_basic_rank_works_at_all(self):
    # these are generic tests
    self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv", "render_widget_host_view.h")
    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.h")
    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.mm")

    self.assertBasicRankAndWordHitCountIs(29, 4, 'ccframerate', 'CCFrameRateController.cpp')


  def test_basic_rank_query_case_doesnt_influence_rank(self):
    a = self.ranker._get_basic_rank("Rwhvm", "render_widget_host_view_mac.h")
    b = self.ranker._get_basic_rank("rwhvm", "Render_widget_host_view_mac.h")
    self.assertEqual(a, b)

  def test_basic_rank_isnt_only_greedy(self):
    # this checks that we consider _mac as a wordstart rather than folding it into macmm
    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_macmm")

  def test_basic_rank_on_corner_cases(self):
    self.assertBasicRankAndWordHitCountIs(0, 0, "", "")
    self.assertBasicRankAndWordHitCountIs(0, 0, "", "x")
    self.assertBasicRankAndWordHitCountIs(0, 0, "x", "")
    self.assertBasicRankAndWordHitCountIs(2, 1, "x", "x")
    self.assertBasicRankAndWordHitCountIs(1, 0, "x", "yx")
    self.assertBasicRankAndWordHitCountIs(0, 0, "x", "abcd")

  def test_basic_rank_on_mixed_wordstarts_and_full_words(self):
    self.assertBasicRankAndWordHitCountIs(17, 3, "enderwhv", "render_widget_host_view.h")
    self.assertBasicRankAndWordHitCountIs(15, 2, "idgethv", "render_widget_host_view.h")

    self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv", "render_widget_host_view_mac.h")
    self.assertBasicRankAndWordHitCountIs(14, 5, "rwhvmac", "render_widget_host_view_mac.h")

    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.h")

  def test_basic_rank_overconditioned_query(self):
    self.assertBasicRankAndWordHitCountIs(2, 1, 'test_thread_tab.py', 'tw')

  def test_basic_rank_on_suffixes_of_same_base(self):
    # render_widget.cpp should be ranked higher than render_widget.h
    # unless the query explicitly matches the .h or .cpp
    pass

  def test_rank_corner_cases(self):
    # empty
    self.assertEqual(0, self.ranker.rank('foo', ''))
    self.assertEqual(0, self.ranker.rank('', 'foo'))

    # undersized
    self.assertEqual(0, self.ranker.rank('foo', 'm'))
    self.assertEqual(0, self.ranker.rank('f', 'oom'))

    # overconditioned
    self.assertEqual(2, self.ranker.rank('test_thread_tab.py', 'tw'))

  def test_rank_subclasses_lower_ranked_than_base(self):
    # this tests that hitting all words counts higher than hitting some of the words
    base_rank = self.ranker.rank("rwhvm", "render_widget_host_view.h")
    subclass_rank = self.ranker.rank("rwhvm", "render_widget_host_view_subclass.h")
    self.assertTrue(base_rank > subclass_rank)

  def test_rank_order_for_hierarchy_puts_bases_first(self):
    names = ['render_widget_host_view_mac.h',
             'render_widget_host_view_mac.mm',
             'render_widget_host_view_mac_delegate.h',
             'render_widget_host_view_mac_unittest.mm',
             'render_widget_host_view_mac_editcommand_helper.mm',
             'render_widget_host_view_mac_editcommand_helper.h',
             'render_widget_host_view_mac_editcommand_helper_unittest.mm',
             ]
    self._assertRankDecreasesOrStaysTheSame("rwhvm", names)

  def _assertRankDecreasesOrStaysTheSame(self, query, names):
    """
    Makes sure that the first element in the array has the highest rank
    and subsequent items have decreasing or equal rank.
    """
    ranks = [self.ranker.rank(query, n) for n in names]
    nw = [self.ranker.get_num_words(n) for n in names]
    basic_ranks = [self.ranker._get_basic_rank(query, n) for n in names]
    for i in range(1, len(ranks)):
      changeInRank = ranks[i] - ranks[i-1]
      self.assertTrue(changeInRank <= 0)

  def test_rank_order_prefers_capitals(self):
    # Ensure we still prefer capitals for simple queries. The heuristics that
    # deal with order_puts_tests_second tend to break this.
    self.assertBasicRankAndWordHitCountIs(6, 3, 'wvi', 'WebViewImpl.cc')

  def test_rank_order_puts_tests_second(self):
    q = "ccframerate"
    a1 = self.ranker.rank(q, 'CCFrameRateController.cpp')
    a2 = self.ranker.rank(q, 'CCFrameRateController.h')
    b = self.ranker.rank(q, 'CCFrameRateControllerTest.cpp')

    # This is a hard test to pass because ccframera(te) ties to (Te)st
    # if you weight non-word matches equally.
    self.assertTrue(a1 > b);
    self.assertTrue(a2 > b);

    q = "chrome_switches"
    a1 = self.ranker.rank(q, 'chrome_switches.cc')
    a2 = self.ranker.rank(q, 'chrome_switches.h')
    b = self.ranker.rank(q, 'chrome_switches_uitest.cc')
    self.assertTrue(a1 > b);
    self.assertTrue(a2 > b);

  def test_rank_order_for_hierarchy_puts_prefixed_second(self):
    q = "ccframerate"
    a = self.ranker.rank(q, 'CCFrameRateController.cpp')
    b1 = self.ranker.rank(q, 'webcore_platform.CCFrameRateController.o.d')
    b2 = self.ranker.rank(q, 'webkit_unit_tests.CCFrameRateControllerTest.o.d')
    self.assertTrue(a > b1);
    # FAILS because ccframera(te) ties to (Te)st
    # self.assertTrue(a > b2);

  def test_rank_order_puts_tests_second_2(self):
    q = "ccdelaybassedti"
    a1 = self.ranker.rank(q, 'CCDelayBasedTimeSource.cpp')
    a2 = self.ranker.rank(q, 'CCDelayBasedTimeSource.h')
    b = self.ranker.rank(q, 'CCDelayBasedTimeSourceTest.cpp')
    self.assertTrue(a1 > b);
    self.assertTrue(a2 > b);

    q = "LayerTexture"
    a = self.ranker.rank(q, 'LayerTexture.cpp')
    b = self.ranker.rank(q, 'LayerTextureSubImage.cpp')
    self.assertTrue(a > b)

  def test_refinement_improves_rank(self):
    a = self.ranker.rank('render_', 'render_widget.cc')
    b = self.ranker.rank('render_widget', 'render_widget.cc')
    self.assertTrue(b > a)
Example #30
  def setUp(self):
#    self.basenames = json.load(open('test_data/cr_files_basenames.json'))
    self.ranker = Ranker()
Example #31
 def __init__(self):
     self.user = User()
     self.ranker = Ranker()
Example #32
if __name__=="__main__":

    '''
    sys.argv[1] => training data
    sys.argv[2] => test data
    sys.argv[3] => data separator
    '''
    training = Predictor(sys.argv[1], sys.argv[3])
    training_users, training_items = training.store_data_relations() #~100MB
    num_factors = 50
    recommender = PureSVDPredictor(training_items, training_users, num_factors)

    #TODO remove redundancy wrt nncosngbr
    N = 10
    ranker = Ranker(N)
    testing = Predictor(sys.argv[2], sys.argv[3])
    test_users, test_items = testing.store_data_relations()
    ev = Evaluator(test_users, N)


    #TODO remove redundancy wrt nncosngbr
    item_ids = list(set(training_items) | set(test_items))  # all unique items in the dataset
    hits = 0
    div_metric1 = []
    div_metric2 = []
    #recommended_ratings = []
    for u in test_users.keys():
        for i in test_users[u].keys():

            #TODO encapsulate it