def __init__(self, files_by_basename):
    """Build search indexes over a set of file basenames.

    files_by_basename: dict mapping a basename (str) to the list of files
    that carry that basename.
    """
    # Merge case-variants of the same basename into one lowercase bucket.
    files_by_lower_basename = {}
    for basename, files_with_basename in files_by_basename.items():
        lower_basename = basename.lower()
        if lower_basename in files_by_lower_basename:
            files_by_lower_basename[lower_basename].extend(files_with_basename)
        else:
            files_by_lower_basename[lower_basename] = files_with_basename

    # Newline-delimited blobs (leading and trailing '\n' so every entry
    # appears as "\n<name>\n") used for regex scanning elsewhere.
    self.basenames_unsplit = ("\n" + "\n".join(files_by_basename.keys()) + "\n").encode('utf8')
    self.lower_basenames_unsplit = ("\n" + "\n".join(files_by_lower_basename.keys()) + "\n").encode('utf8')
    # BUGFIX: after .encode('utf8') the blob is bytes, not str — the old
    # "== str" assertion only held on Python 2 where str was bytes.
    assert type(self.lower_basenames_unsplit) == bytes

    ranker = Ranker()
    wordstarts = {}
    for basename, files_with_basename in files_by_basename.items():
        start_letters = ranker.get_start_letters(basename)
        if len(start_letters) <= 1:
            continue
        lower_basename = basename.lower()
        # abcd -> ab abc abcd : index every wordstart prefix of length >= 2.
        for i in range(len(start_letters) + 1 - 2):
            ws = ''.join(start_letters[0:2 + i])
            if ws not in wordstarts:
                wordstarts[ws] = []
            # loss = wordstart letters NOT covered by this prefix; a lower
            # loss means the prefix matches the basename more tightly.
            loss = len(start_letters) - (2 + i)
            wordstarts[ws].append((lower_basename, loss))

    # Order entries so the highest-quality (lowest-loss) hits come first.
    # MODERNIZED: dict.iteritems() and cmp-based list.sort() are Python 2
    # only; use items() and a sort key instead.
    self.basenames_by_wordstarts = {}
    for ws, entries in wordstarts.items():
        entries.sort(key=lambda entry: entry[1])
        self.basenames_by_wordstarts[ws] = [entry[0] for entry in entries]
def __init__(self):
    """Load vocabulary, tokenized id data and the dataset splits, then
    construct the ranking model from FLAGS hyper-parameters."""
    self.du = DU()
    self.vocab, self.recab = self.du.initialize_vocabulary()

    # Each line of ids_path is a space-separated list of integer token ids;
    # blank lines become empty lists so line indices stay aligned.
    self.ids_arr = []
    # BUGFIX: the file handles below were never closed; use context managers.
    with open(self.du.ids_path) as ids_file:
        for line in ids_file:
            line = line.strip()
            if len(line) > 0:
                self.ids_arr.append([int(tok) for tok in line.split(' ')])
            else:
                self.ids_arr.append([])

    with open(self.du.mark_path) as f:
        self.mark = json.load(f)
    with open(self.du.train_path) as f:
        self.train = json.load(f)
    with open(self.du.dev_path) as f:
        self.dev = json.load(f)
    with open(self.du.test_path) as f:
        self.test = json.load(f)

    self.model = Ranker(
        vocab_size=FLAGS.vocab_size,
        embedding_size=FLAGS.emd_size,
        memory_size=FLAGS.mem_size,
        batch_size=FLAGS.batch_size,
        max_dialogue_size=FLAGS.max_dialogue_size,
        max_sentence_size=FLAGS.max_sentence_size,
        margin=FLAGS.margin,
        max_gradient_norm=FLAGS.max_gradient_norm,
        learning_rate=FLAGS.learning_rate,
        learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
        use_lstm=False,
        train_mode=FLAGS.train,
        # drop_out = FLAGS.drop_out,
        # layer = FLAGS.layer
    )
def __init__(self, parser, indexer, model=None, model_1=None):
    """Wire up the search engine's collaborators: a parser, an indexer,
    optional models, a fresh Ranker and a spell checker."""
    self._parser = parser
    self._indexer = indexer
    self._model = model
    self._model_1 = model_1
    self._ranker = Ranker()
    self.spellcheck = Spell_check()
def __init__(self, parser, indexer, model=None):
    """Hold the engine's collaborators plus per-query bookkeeping state."""
    self._parser = parser
    self._indexer = indexer
    self._model = model
    self._ranker = Ranker()
    # Cache of terms already looked up during search.
    self.terms_searched = {}
    # Total corpus size as recorded by the parser.
    self.total_num_of_docs = parser.curr_idx
def add_all_matching(self, hits, query, flt_tuple, max_hits):
    """
    hits is the dictionary to put results in
    query is the query string originally entered by user, used by ranking
    flt_tuple is [filter_regex, case_sensitive_bool]
    max_hits is largest hits should grow before matching terminates.
    """
    flt, case_sensitive = flt_tuple
    regex = re.compile(flt)
    base = 0
    ranker = Ranker()
    # Pick the index blob that matches the requested case sensitivity; the
    # insensitive blob is already lowercased.
    if not case_sensitive:
        index = self.lower_basenames_unsplit
    else:
        index = self.basenames_unsplit
    while True:
        # Scan forward from 'base'; entries are delimited as "\n<name>\n".
        m = regex.search(index, base)
        if m:
            hit = m.group(0)[1:-1]  # strip the surrounding newlines
            if hit.find('\n') != -1:
                raise Exception("Somethign is messed up with flt=[%s] query=[%s] hit=[%s]" % (flt, query, hit))
            rank = ranker.rank(query, hit)
            # NOTE(review): lowering only in the case_sensitive branch looks
            # odd at first glance; presumably it normalizes the hits keys
            # (the insensitive index is already lowercase) — confirm.
            if case_sensitive:
                hit = hit.lower()
            if hit in hits:
                hits[hit] = max(hits[hit], rank)
            else:
                hits[hit] = rank
            # Back up one so the trailing '\n' can begin the next match.
            base = m.end() - 1
            if len(hits) >= max_hits:
                # NOTE(review): 'truncated' is assigned but never read in
                # this block — presumably a leftover debugging flag.
                truncated = True
                break
        else:
            break
def __init__(self):
    """Set up the language-model, content-transfer and knowledge-base
    components, plus a ranker backed by the language model."""
    super().__init__()
    self.kb = KnowledgeBase()
    self.model_ct = ContentTransfer()
    self.model_lm = LanguageModel()
    # The ranker scores candidates using the language model.
    self.ranker = Ranker(self.model_lm)
    self.local = True
def add_all_wordstarts_matching(self, hits, query, max_hits):
    """Rank every basename indexed under the query's wordstart key and add
    it to hits, stopping once max_hits entries have accumulated."""
    key = query.lower()
    if key not in self.basenames_by_wordstarts:
        return
    ranker = Ranker()
    for candidate in self.basenames_by_wordstarts[key]:
        hits[candidate] = ranker.rank(query, candidate)
        if len(hits) >= max_hits:
            return
def add_all_wordstarts_matching( self, hits, query, max_hits ):
    """Add ranked wordstart matches for the query into hits; stop as soon
    as hits reaches max_hits entries."""
    lowered = query.lower()
    candidates = self.basenames_by_wordstarts.get(lowered)
    if candidates is None:
        return
    ranker = Ranker()
    for name in candidates:
        hits[name] = ranker.rank(query, name)
        if len(hits) >= max_hits:
            return
def __init__(self, parser, indexer, config, model=None):
    """Keep the engine's collaborators and copy the tuning knobs out of
    the config object."""
    self._parser = parser
    self._indexer = indexer
    self._model = model
    self._ranker = Ranker(config)
    # Ranking thresholds / weights taken from config.
    self._the_count = config.the_count
    self._wordnet_count = config.wordnet_count
    self._min_relevant = config.min_relevant
    self._ext_val = config.ext_val
class Recommender:
    """Recommends states and counties for a user by delegating to a Ranker.

    Region vectors are either supplied directly or loaded from the crawler's
    JSON results and normalized before being handed to the ranker.
    """

    def __init__(self):
        self.user = User()
        self.ranker = Ranker()

    def _load_normalized(self, relative_path):
        """Load a crawler result JSON and normalize each region vector.

        The JSON's second element maps a region name to a list whose first
        entry is a total; the remaining entries are divided by that total.
        (Factored out of build_states/build_counties, which duplicated it.)
        """
        import os
        path = os.path.abspath(os.path.dirname(__file__))
        # BUGFIX: the file handle was previously leaked; close it via 'with'.
        with open(os.path.join(path, relative_path)) as json_data:
            data = json.load(json_data)
        # Renamed from 'dict', which shadowed the builtin.
        table = data[1]
        for region in table.keys():
            total = float(table[region][0])
            table[region] = [table[region][i] / total
                             for i in range(1, len(table[region]))]
        return table

    def build_states(self, states=None):
        """ Add the states to the ranker in the proper format """
        if states is not None:
            self.ranker.states = states
        else:
            self.ranker.states = self._load_normalized('crawler/results/state.json')

    def build_counties(self, counties=None):
        """ Add the counties to the ranker in the proper format """
        if counties is not None:
            self.ranker.counties = counties
        else:
            self.ranker.counties = self._load_normalized('crawler/results/county.json')

    def set_user(self, user):
        self.user = user

    def recommend_states(self):
        return self.ranker.rank_states(self.user.get_vector())

    def recommend_counties(self):
        return self.ranker.rank_counties(self.user.get_vector())
def inference():
    """End-to-end submission pipeline: load preprocessor/models, generate
    candidate articles per reader, rank them, and write the submission file."""
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()
    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    # Best-effort cache of the generated candidates so reruns can skip
    # candidate generation (see the commented pickle.load line below).
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except:
        print("Couldn't save submit_puke")
    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))
    rank_scores = ranker.rank(X_submit)
    base = 0
    entire_articles = []
    # Fallback pool for padding: every article except the "heavy" ones.
    not_heavy_items = set(range(1, article_count+1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50  # keep the top-50 ranked candidates per reader
    random.seed(0)  # deterministic padding across runs
    with result_path.open('w') as fout:
        # q_submit holds the candidate-group size per reader; slices of
        # X_article_nums / rank_scores line up with those groups via 'base'.
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base+group_size]
            scores = rank_scores[base:base+group_size]
            # Sort this reader's candidates by descending score.
            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]
            # Top up (to cut + 15) with articles from followed authors.
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)
            # Pad to exactly 100 recommendations with random non-heavy items.
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)
            entire_articles.extend(articles)
            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)
            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))
            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
def __init__(self):
    """Build the QA, conversing-by-reading and open-domain models plus a
    ranker that re-scores with a reverse (MMI) DialoGPT."""
    super().__init__()
    self.kb = KnowledgeBase()
    self.model_mrc = BidafQA()
    self.model_cmr = ConversingByReading()
    self.model_open = DialoGPT()
    # Reverse model is used only inside the ranker; not kept as an attribute.
    model_mmi = DialoGPT(path_model='models/DialoGPT/small_reverse.pkl')
    self.ranker = Ranker(self.model_open, model_mmi)
    self.local = True
def __init__(self, filename):
    """Build an interactive solution tree from the SAT solutions in *filename*."""
    # Deep clustering trees can blow the default recursion limit.
    x = 5000
    sys.setrecursionlimit(x)
    #names, cnf = IO.read_dimacs('SPLOT-3CNF-FM-500-50-1.00-SAT-10')
    self.items = SATSolver.get_solutions(10000, filename)  # sample up to 10000 solutions
    self.weights = [1] * len(self.items)  # uniform initial weights
    self.tree = sway(self.items, 100)  # cluster the solutions into a tree
    self.names = []  #names
    # Weight of top node = 0
    # self.tree.weight = 0
    self.rank = Ranker.level_rank_features(self.tree, self.weights)
    self.cur_best_node = Ranker.rank_nodes(self.tree, self.rank)
    self.questions = IO.get_question_text('terms_sentence_map.csv', 'sentence')
def __init__(self, parser, indexer, model=None):
    """Wire the engine to an already-built indexer, falling back to a
    non-GloVe ranker when no embeddings were loaded."""
    self.config = indexer.config
    self._parser = parser
    self._indexer = indexer
    self.number_of_docs = indexer.num_of_docs
    self._model = model
    self.inverted_index, self.document_dict = self._indexer.inverted_idx, self._indexer.document_dict
    self.glove_dict = self._indexer.glove_dict
    # IDIOM: replaced the flag-and-reassign dance with a direct boolean.
    use_glove = len(self.glove_dict) > 0
    self.ranker = Ranker(self.config, use_glove)
def check_solution(self):
    """Return every item in the tree once ranking is exhausted or a solution
    is confirmed; return None when no solution exists."""
    if sum(self.rank) == 0:
        return Search.get_all_items(self.tree)
    if Ranker.check_solution(self.tree) is None:
        return None
    return Search.get_all_items(self.tree)
def __init__(self, parser, output_path, stem):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.parser = parser
    self.ranker = Ranker(output_path, stem)
    self.path = output_path
    self.stem = stem
    self.counter = 1
    # LDA-related members, populated later by other methods.
    self.lda_model = None
    self.dictionary = None
    self.dict = None
    self.documents = None
    self.docslen = 0
    # Per-bucket document counters keyed by ordinal bucket name.
    # (the final key's spelling is kept exactly as-is)
    ordinals = ['zero', 'first', 'second', 'third', 'fourth',
                'fifth', 'sixth', 'seventh', 'eighth']
    self.documentfilenames = {'%s_documents' % name: 0 for name in ordinals}
    self.documentfilenames['ninth_documens'] = 0
def scheduled_job():
    """ This job is run every monday at 12. """
    now = datetime.datetime.now()
    ranking = Ranker('internet-tecnologia', 445, 5).build()
    # File name carries the run date, e.g. storage/ranking_2020-01-06.json
    out_name = 'storage/ranking_{0}-{1}-{2}.json'.format(
        now.year, now.strftime('%m'), now.strftime('%d'))
    Storage.save(out_name, ranking)
def __init__(self, inverted_index, posting_file=None):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.inverted_index = inverted_index
    self.posting_file = posting_file
    self.ranker = Ranker()
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relavant
        and the last is the least relevant result.
    """
    query_as_dict = self._parser.parse_query(query)
    # Query expansion via thesaurus: for each original term, add the first
    # synonym (from up to 30 candidates) that already exists in the
    # inverted index. Iterate over a copy because the dict is mutated.
    for word in query_as_dict.copy().keys():
        if len(thes.synonyms(word)[1][1]):
            syn = list(thes.synonyms(word)[1][1])[:30]
            for s in syn:
                if s not in query_as_dict and s in self._indexer.inverted_idx:
                    query_as_dict[s] = 1  # expansion terms get weight 1
                    break  # at most one synonym per original word
    relevant_docs = self._relevant_docs_from_posting(query_as_dict)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    # print("SE4 top5:")
    # print(ranked_doc_ids[:5])
    # NOTE(review): 'k' is accepted but never applied here — confirm
    # whether truncation to top-k is expected by callers.
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
def __init__(self, seed_urls=None, save_html=1, use_splash=1,
             screenshot_dir='/memex-pinterest/ui/static/images/screenshots',
             op_time=10, **kwargs):
    '''
    Constructs spider instance from command-line or scrapyd daemon.
    :param seed_urls: Comma-separated list of URLs, if empty crawler will be following not crawled URLs from storage
    :param save_html: boolean 0/1
    :param use_splash: boolean 0/1
    :param screenshot_dir: used only when use_splash=1
    :param op_time: operating time in minutes, negative - don't use that constraint
    :param kwargs:
    :return:
    '''
    super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
    self.screenshot_dir = screenshot_dir
    log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)
    if seed_urls:
        # Seeds may arrive without a scheme; normalize them first.
        self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
    self.ranker = Ranker.load()
    self.linkextractor = LinkExtractor()
    self.save_html = bool(save_html)
    self.use_splash = bool(use_splash)
    # Operating-time budget converted to seconds; a negative value means
    # the constraint is disabled (per the docstring above).
    self.operating_time = int(op_time) * 60
    self.start_time = datetime.utcnow()
    self.finishing = False
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relavant
        and the last is the least relevant result.
    """
    query_as_dict = self._parser.parse_query(query)
    # Query expansion via WordNet: gather lemma names for every synset of
    # each original term, then add the first synonym already present in
    # the inverted index. Iterate over a copy since the dict is mutated.
    for word in query_as_dict.copy().keys():
        syn = []
        # if word not in self._indexer.inverted_idx:
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                syn.append(lemma.name().replace('_', ' '))
        # add the synonyms
        for s in syn:
            if s not in query_as_dict and s in self._indexer.inverted_idx:
                query_as_dict[s] = 1  # expansion terms get weight 1
                break  # at most one synonym per original word
    relevant_docs = self._relevant_docs_from_posting(query_as_dict)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    # NOTE(review): 'k' is accepted but never applied — the full ranking
    # is always returned; confirm against callers.
    n_relevant = len(ranked_doc_ids)
    # print("SE1 top5:")
    # print(ranked_doc_ids[:5])
    return n_relevant, ranked_doc_ids
def __init__(self, config=None):
    """Build the engine's parser, indexer and ranker from the given config."""
    self._config = config
    self._parser = Parse(self._config)
    self._indexer = Indexer(self._config)
    self._ranker = Ranker()
    # Model is attached later, if at all.
    self._model = None
def main():
    """Run every configured experiment several times, training a fresh
    ranker per run and writing outputs under output/experiments_v3/."""
    experiment_set = final_experiment
    print("There are {} experiments to run".format(len(experiment_set)))
    train_data_path = "data/training.dat"
    dev_data_path = "data/full/dev.dat"
    tst_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    # BUGFIX: count feature lines without leaking the file handle
    # (was len([line for line in open(feats_path)])).
    with open(feats_path) as feats_file:
        num_feats = sum(1 for _ in feats_file)
    batch_size = 80
    runs_per_experiment = 5
    for experiment_name in experiment_set.keys():
        logger.info("Running experiment {}".format(experiment_name))
        exp_features = experiment_set[experiment_name]
        out_path = 'output/experiments_v3/{}'.format(experiment_name)
        makedirs(out_path, exist_ok=True)
        train_instances = load_data(train_data_path, num_feats, exp_features)
        dev_instances = load_data(dev_data_path, num_feats, exp_features)
        dev_eval_instances = load_eval_data(dev_data_path, num_feats, exp_features)
        tst_instances = load_eval_data(tst_data_path, num_feats, exp_features)
        logger.info("Loaded {} training instances with {} features".format(
            len(train_instances), num_feats))
        # Repeat each experiment to average over training randomness.
        for i in range(runs_per_experiment):
            iter_path = out_path + '/v{}'.format(i)
            makedirs(iter_path, exist_ok=True)
            ranker = Ranker(num_feats, 256)
            trainer = RankerTrainer(ranker, batch_size, iter_path)
            trainer.train(train_instances, dev_instances, None,
                          dev_eval_instances, tst_instances)
def train_and_score_mongo():
    """ Rescore all items from mongo """
    print "**************Training*********************"
    train_on_user_input()
    print "**************Scoring and Indexing*****************"
    mmu = MemexMongoUtils()
    docs = mmu.list_all_urls_iterator(return_html = True)
    ranker = Ranker.load()
    for doc in tqdm(docs, leave = True):
        # NOTE(review): the bare except silently maps ANY scoring failure to
        # 0 — consider narrowing to the expected exception and logging.
        try:
            score = ranker.score_doc(doc)
        except:
            score = 0
        mmu.set_score(doc["url"], score)
    # Recompute per-host aggregate scores after the per-doc pass.
    _score_hosts()
# ---- CLI arguments: paths to precomputed indexes and the query image ----
# (the parser object 'argprse' is created above this chunk)
argprse.add_argument("-c", "--hsv", required = True, help = "File Path where the computed hsv index is saved")
argprse.add_argument("-t", "--texture", required = True, help = "File Path where the computed texture index is saved")
argprse.add_argument("-b", "--btree", required = True, help = "File Path where the computed tree index is saved")
argprse.add_argument("-q", "--query", required = True, help = "File Path to the query image")
argmnts = vars(argprse.parse_args())
# loading the query image and describing its color, texture and tree features
# (each descriptor gets its own copy so none mutates the original image)
query_img = cv2.imread(argmnts["query"])
cfeats = cdes.describe_color(copy.copy(query_img))
texture = txdes.describe_texture(copy.copy(query_img))
tree = tdes.color_tree(copy.copy(query_img))
# ranking the images in our dataset based on the query image
ranker = Ranker(argmnts["hsv"], argmnts["texture"], argmnts["btree"])
final_results = ranker.rank(cfeats, texture, tree)
current_path = os.path.dirname(os.path.abspath(__file__))
# iterating over the final results (Python 2 print syntax below)
for (score, resID) in final_results:
    # printing the image names in the order of increasing score
    print resID + " "+ str(score)
    # copy each result image into ./result next to this script
    source_path = argmnts["dataset"]+"/"+ resID
    dest_path = current_path+"/result/"+resID
    shutil.copy2(source_path,dest_path)
    # (tail of a recommendation function whose definition is above this chunk)
    return rankings

if __name__=="__main__":
    '''
    sys.argv[1] => training data
    sys.argv[2] => data separator
    '''
    a = Predictor(sys.argv[1], sys.argv[2])
    users, items = a.store_data_relations() #~100MB
    #ratings, means = a.normalize_ratings(users)
    recommender = UserBasedPredictor(users) #first, without normalizing
    #recommender = UserBasedPredictor(ratings, means)
    #todo: check which items are recommended most and whether they are popular
    #todo: study the personalization level of this thing
    b = Ranker(5)
    statistically_better = 0.0
    # Count users for whom the top-60 recommendations contain at least one
    # statistically "great" item (Python 2 print syntax below).
    for u in users.keys():
        #print u, b.topRatings(recommender.getRecommendations(u)[:30])
        a = b.maximizeKGreatItems(1, recommender.getRecommendations(u)[:60], items)
        if a:
            statistically_better += 1.0
    print statistically_better
    print statistically_better/len(users.keys())
    #TODO use euclidian distance
    #TODO chooses what gives the best rmse
    # (tail of the '/arrivals/last/<n>' handler defined above this chunk)
    response = arrs_ranker.get_last(int(n))
    return response

@route('/arrivals/first/<n>')
def get_fist(n):
    """
    Returns the first 'n' airports in arrivals ranking
    """
    # NOTE(review): 'get_fist' looks like a typo for 'get_first'; the route
    # path is what clients see, so the name is cosmetic. Also note 'status'
    # is assigned but never used in the response — confirm intent.
    if not arrs_ranker:
        status = 500
        response = "Ranker not initialized"
    else:
        status = 200
        response = arrs_ranker.get_first(int(n))
    return response

if __name__ == "__main__":
    """
    """
    #start ranker (Python 2 print syntax below)
    print "starting ranker"
    arrs_ranker = Ranker()
    print "setting up ranking"
    arrs_ranker.setup_arrivals_ranking()
    print "starting server"
    #running server
    run(host='localhost', port=8080)
    # (tail of a scoring function whose definition is above this chunk)
    # Flip (item, score) pairs so the tuple sort orders by score, then
    # reverse for descending order.
    rankings = [(score,item) for item,score in scores.items()]
    rankings.sort(); rankings.reverse()
    return rankings

if __name__=="__main__":
    '''
    sys.argv[1] => training data
    sys.argv[2] => test data
    sys.argv[3] => data separator
    '''
    training = Predictor(sys.argv[1], sys.argv[3])
    training_users, training_items = training.store_data_relations() #~100MB
    recommender = NNCossNgbrPredictor(training_items, training_users)
    N = 10
    ranker = Ranker(N)
    testing = Predictor(sys.argv[2], sys.argv[3])
    test_users, test_items = testing.store_data_relations()
    ev = Evaluator(test_users, N)
    #TODO clean this interface!
    item_ids = list(set(training_items.keys() + test_items.keys())) #all unique items in the dataset
    # Evaluation accumulators (the loop continues past this chunk).
    hits = 0
    div_metric1 = []
    div_metric2 = []
    recommended_ratings = []
    for u in test_users.keys():
        for i in test_users[u].keys():
            user_items = []
class RankerTest(unittest.TestCase):
    """Unit tests for Ranker's wordstart detection and ranking heuristics."""

    def setUp(self):
        # self.basenames = json.load(open('test_data/cr_files_basenames.json'))
        self.ranker = Ranker()

    def test_is_wordstart(self):
        def check(s, expectations):
            # One expected boolean per character of s.
            assert len(s) == len(expectations)
            for i in range(len(s)):
                self.assertEquals(expectations[i],
                                  self.ranker._is_wordstart(s, i),
                                  "disagreement on index %i" % i)
        check("foo", [True, False, False])
        check("fooBar", [True, False, False, True, False, False])
        check("o", [True])
        check("_", [True])
        check("F", [True])
        check("FooBar", [True, False, False, True, False, False])
        check("Foo_Bar", [True, False, False, False, True, False, False])
        check("_Bar", [True, True, False, False])
        check("_bar", [True, True, False, False])
        check("foo_bar", [True, False, False, False, True, False, False])
        check(".h", [True, False])
        check("a.h", [True, False, False])
        check("__b", [True, False, True])
        check("foo__bar", [True, False, False, False, False, True, False, False])
        check("Foo3D", [True, False, False, True, True])
        check("Foo33", [True, False, False, True, False])
        check("x3d", [True, True, False]) # I could be convinced that 'd' is a wordstart.
        check("AAb", [True, True, False])
        check("CCFra", [True, True, True, False, False])

    def test_get_word_starts(self):
        data = {
            # This comment simply helps map indice to values
            #            1234567
            '' : [],
            'abc' : [0],
            'abd_def' : [0, 4],
            'ab_cd_ef' : [0, 3, 6],
            'ab_' : [0],
            'AA': [0, 1],
            'AAbA': [0,1,3],
            'Abc': [0],
            'AbcDef': [0,3],
            'Abc_Def': [0,4],
        }
        for word, expected_starts in data.items():
            starts = self.ranker.get_starts(word)
            self.assertEquals(expected_starts, starts, "for %s, expect %s" % (word, expected_starts))

    def assertBasicRankAndWordHitCountIs(self, expected_rank, expected_word_count, query, candidate):
        # Helper: _get_basic_rank returns (rank, word_hit_count).
        res = self.ranker._get_basic_rank(query, candidate)
        self.assertEquals(expected_rank, res[0])
        self.assertEquals(expected_word_count, res[1])

    def test_query_hits_on_word_starts(self):
        self.assertBasicRankAndWordHitCountIs(8, 4, 'rwhv', 'render_widget_host_view.cc') # test +1 for hitting all words
        self.assertBasicRankAndWordHitCountIs(6, 3, 'rwh', 'render_widget_host_view.cc')
        self.assertBasicRankAndWordHitCountIs(5.5, 2, 'wvi', 'render_widget_host_view_win.cc') # eew
        self.assertBasicRankAndWordHitCountIs(2, 1, 'w', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(2, 1, 'v', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(5, 2, 'evi', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'eWbViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(6, 0, 'ebewp', 'WebViewImpl.cc')

    def test_basic_rank_pays_attention_to_case(self):
        # these test that we aren't losing catching case transpitions
        self.assertBasicRankAndWordHitCountIs(4.5, 1, "rw", "rwf")
        self.assertBasicRankAndWordHitCountIs(4, 2, "rw", "rWf")

    def test_basic_rank_works_at_all(self):
        # these are generic tests
        self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.mm")
        self.assertBasicRankAndWordHitCountIs(29, 4, 'ccframerate', 'CCFrameRateController.cpp')

    def test_basic_rank_query_case_doesnt_influence_rank(self):
        a = self.ranker._get_basic_rank("Rwhvm", "render_widget_host_view_mac.h")
        b = self.ranker._get_basic_rank("rwhvm", "Render_widget_host_view_mac.h")
        self.assertEquals(a, b)

    def test_basic_rank_isnt_only_greedy(self):
        # this checks that we consider _mac and as a wordstart rather than macmm
        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_macmm")

    def test_basic_rank_on_corner_cases(self):
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "")
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "x")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "")
        self.assertBasicRankAndWordHitCountIs(2, 1, "x", "x")
        self.assertBasicRankAndWordHitCountIs(1, 0, "x", "yx")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "abcd")

    def test_basic_rank_on_mixed_wordstarts_and_full_words(self):
        self.assertBasicRankAndWordHitCountIs(17, 3, "enderwhv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(15, 2, "idgethv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(14, 5, "rwhvmac", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.h")

    def test_basic_rank_overconditioned_query(self):
        self.assertBasicRankAndWordHitCountIs(2, 1, 'test_thread_tab.py', 'tw')

    def test_basic_rank_on_suffixes_of_same_base(self):
        # render_widget.cpp should be ranked higher than render_widget.h
        # unless the query explicitly matches the .h or .cpp
        pass

    def test_rank_corner_cases(self):
        # empty
        self.assertEquals(0, self.ranker.rank('foo', ''))
        self.assertEquals(0, self.ranker.rank('', 'foo'))
        # undersized
        self.assertEquals(0, self.ranker.rank('foo', 'm'))
        self.assertEquals(0, self.ranker.rank('f', 'oom'))
        # overconditioned
        self.assertEquals(2, self.ranker.rank('test_thread_tab.py', 'tw'))

    def test_rank_subclasses_lower_ranked_than_base(self):
        # this tests that hitting all words counts higher than hitting some of the words
        base_rank = self.ranker.rank("rwhvm", "render_widget_host_view.h")
        subclass_rank = self.ranker.rank("rwhvm", "render_widget_host_view_subclass.h")
        self.assertTrue(base_rank > subclass_rank)

    def test_rank_order_for_hierarchy_puts_bases_first(self):
        # NOTE(review): a comma is missing after
        # 'render_widget_host_view_mac_editcommand_helper.h', so Python
        # implicitly concatenates it with the next literal and the list has
        # one fewer entry than intended — confirm and fix.
        names = ['render_widget_host_view_mac.h',
                 'render_widget_host_view_mac.mm',
                 'render_widget_host_view_mac_delegate.h',
                 'render_widget_host_view_mac_unittest.mm',
                 'render_widget_host_view_mac_editcommand_helper.mm',
                 'render_widget_host_view_mac_editcommand_helper.h'
                 'render_widget_host_view_mac_editcommand_helper_unittest.mm',
                 ]
        self._assertRankDecreasesOrStaysTheSame("rwhvm", names)

    def _assertRankDecreasesOrStaysTheSame(self, query, names):
        """
        Makes sure that the first element in the array has highest rank
        and subsequent items have decreasing or equal rank.
        """
        ranks = [self.ranker.rank(query, n) for n in names]
        # NOTE(review): nw and basic_ranks are computed but never asserted
        # on — presumably kept around for debugging.
        nw = [self.ranker.get_num_words(n) for n in names]
        basic_ranks = [self.ranker._get_basic_rank(query, n) for n in names]
        for i in range(1, len(ranks)):
            changeInRank = ranks[i] - ranks[i-1]
            self.assertTrue(changeInRank <= 0)

    def test_rank_order_prefers_capitals(self):
        # Ensure we still prefer capitals for simple queries The heuristics that
        # deal with order_puts_tests_second tends to break this.
        self.assertBasicRankAndWordHitCountIs(6, 3, 'wvi', 'WebViewImpl.cc')

    def test_rank_order_puts_tests_second(self):
        q = "ccframerate"
        a1 = self.ranker.rank(q, 'CCFrameRateController.cpp')
        a2 = self.ranker.rank(q, 'CCFrameRateController.h')
        b = self.ranker.rank(q, 'CCFrameRateControllerTest.cpp')
        # This is a hard test to pass because ccframera(te) ties to (Te)st
        # if you weight non-word matches equally.
        self.assertTrue(a1 > b);
        self.assertTrue(a2 > b);
        q = "chrome_switches"
        a1 = self.ranker.rank(q, 'chrome_switches.cc')
        a2 = self.ranker.rank(q, 'chrome_switches.h')
        b = self.ranker.rank(q, 'chrome_switches_uitest.cc')
        self.assertTrue(a1 > b);
        self.assertTrue(a2 > b);

    def test_rank_order_for_hierarchy_puts_prefixed_second(self):
        q = "ccframerate"
        a = self.ranker.rank(q, 'CCFrameRateController.cpp')
        b1 = self.ranker.rank(q, 'webcore_platform.CCFrameRateController.o.d')
        b2 = self.ranker.rank(q, 'webkit_unit_tests.CCFrameRateControllerTest.o.d')
        self.assertTrue(a > b1);
        # FAILS because ccframera(te) ties to (Te)st
        # self.assertTrue(a > b2);

    def test_rank_order_puts_tests_second_2(self):
        q = "ccdelaybassedti"
        a1 = self.ranker.rank(q, 'CCDelayBasedTimeSource.cpp')
        a2 = self.ranker.rank(q, 'CCDelayBasedTimeSource.h')
        b = self.ranker.rank(q, 'CCDelayBasedTimeSourceTest.cpp')
        self.assertTrue(a1 > b);
        self.assertTrue(a2 > b);
        q = "LayerTexture"
        a = self.ranker.rank(q, 'LayerTexture.cpp')
        b = self.ranker.rank(q, 'LayerTextureSubImage.cpp')
        self.assertTrue(a > b)

    def test_refinement_improves_rank(self):
        # A longer, more specific query should outrank its own prefix.
        a = self.ranker.rank('render_', 'render_widget.cc')
        b = self.ranker.rank('render_widget', 'render_widget.cc')
        self.assertTrue(b > a)
def setUp(self):
    """Create a fresh Ranker instance for every test."""
    # self.basenames = json.load(open('test_data/cr_files_basenames.json'))
    self.ranker = Ranker()
def __init__(self):
    # Default user profile; typically replaced later via a setter.
    self.user = User()
    self.ranker = Ranker()
if __name__=="__main__": ''' sys.argv[1] => training data sys.argv[2] => test data sys.argv[3] => data separator ''' training = Predictor(sys.argv[1], sys.argv[3]) training_users, training_items = training.store_data_relations() #~100MB num_factors = 50 recommender = PureSVDPredictor(training_items, training_users, num_factors) #TODO remove redundancy wrt nncosngbr N = 10 ranker = Ranker(N) testing = Predictor(sys.argv[2], sys.argv[3]) test_users, test_items = testing.store_data_relations() ev = Evaluator(test_users, N) #TODO remove redundancy wrt nncosngbr item_ids = list(set(training_items.keys() + test_items.keys())) #all unique items in the dataset hits = 0 div_metric1 = [] div_metric2 = [] #recommended_ratings = [] for u in test_users.keys(): for i in test_users[u].keys(): #TODO encapsulate it