def main():
    experiment_set = final_experiment
    print("There are {} experiments to run".format(len(experiment_set)))
    train_data_path = "data/training.dat"
    dev_data_path = "data/full/dev.dat"
    tst_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    # Count one feature per line of the feature file, closing it afterwards.
    with open(feats_path) as feats_file:
        num_feats = sum(1 for _ in feats_file)
    batch_size = 80
    runs_per_experiment = 5
    for experiment_name, exp_features in experiment_set.items():
        logger.info("Running experiment {}".format(experiment_name))
        out_path = 'output/experiments_v3/{}'.format(experiment_name)
        makedirs(out_path, exist_ok=True)
        train_instances = load_data(train_data_path, num_feats, exp_features)
        dev_instances = load_data(dev_data_path, num_feats, exp_features)
        dev_eval_instances = load_eval_data(dev_data_path, num_feats, exp_features)
        tst_instances = load_eval_data(tst_data_path, num_feats, exp_features)
        logger.info("Loaded {} training instances with {} features".format(
            len(train_instances), num_feats))
        for i in range(runs_per_experiment):
            iter_path = out_path + '/v{}'.format(i)
            makedirs(iter_path, exist_ok=True)
            ranker = Ranker(num_feats, 256)
            trainer = RankerTrainer(ranker, batch_size, iter_path)
            # None stands in for the train-eval instances used elsewhere.
            trainer.train(train_instances, dev_instances, None,
                          dev_eval_instances, tst_instances)

def __init__(self, docs, path):
    """
    :param docs: (term_dict, document_dict) pair
    :param path: path to the posting files
    """
    self.ranker = Ranker(self)
    self.term_dict, self.document_dict = docs
    self.POSTING_PATH = path

def __init__(self, parser, indexer, model=None):
    self._parser = parser
    self._indexer = indexer
    self._ranker = Ranker()
    self._model = model
    self.terms_searched = {}
    self.total_num_of_docs = parser.curr_idx

def main(args):
    # Fix random seeds for reproducibility.
    torch.manual_seed(333)
    if use_cuda:
        torch.cuda.manual_seed(333)
    random.seed(333)
    train_data_path = "data/training.dat"
    train_eval_data_path = "data/train-eval.dat"
    dev_data_path = "data/full/dev.dat"
    eval_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    # Count one feature per line of the feature file, closing it afterwards.
    with open(feats_path) as feats_file:
        num_feats = sum(1 for _ in feats_file)
    batch_size = 80
    ranker = Ranker(num_feats, 256)
    # Instances for training - loaded as pairs
    feat_indices = set(range(num_feats))
    train_instances = load_data(train_data_path, num_feats, feat_indices)
    train_eval_instances = load_eval_data(train_data_path, num_feats, feat_indices)
    dev_instances = load_data(dev_data_path, num_feats, feat_indices)
    dev_eval_instances = load_eval_data(dev_data_path, num_feats, feat_indices)
    tst_instances = load_eval_data(eval_data_path, num_feats, feat_indices)
    logger.info("Loaded {} training instances with {} features".format(
        len(train_instances), num_feats))
    trainer = RankerTrainer(ranker, batch_size, 'output/')
    trainer.train(train_instances, dev_instances, train_eval_instances,
                  dev_eval_instances, tst_instances)
    ranker.save('output/ranker.model')

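# The two training scripts above construct Ranker(num_feats, 256) but the model
# itself is not shown. Below is a minimal sketch of what such a feature-based
# ranker could look like in PyTorch; the class name and constructor arguments
# mirror the calls above, but the architecture (a single hidden layer of the
# given width) and the save() helper are assumptions, not the original code.
import torch
import torch.nn as nn

class Ranker(nn.Module):
    def __init__(self, num_feats, hidden_size):
        super().__init__()
        # Score a feature vector with a small feed-forward network.
        self.net = nn.Sequential(
            nn.Linear(num_feats, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        # x: (batch, num_feats) -> (batch,) relevance scores.
        return self.net(x).squeeze(-1)

    def save(self, path):
        torch.save(self.state_dict(), path)
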
def __init__(self, inverted_index, posting_file=None):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.ranker = Ranker()
    self.inverted_index = inverted_index
    self.posting_file = posting_file

def __init__(self, parser, indexer, model=None):
    self._parser = parser
    self._indexer = indexer
    self._ranker = Ranker()
    self._model = model
    self._docs_dict = {}
    self.number_of_documents = len(indexer.docs_dict)

def __init__(self, tokenizer_mode, file='../content/metadata.csv',
             stopwords_file="../content/snowball_stopwords_EN.txt",
             chunksize=10000, queries_path='../content/queries.txt',
             rank_mode='bm25', docs_limit=50, positional_flag=False):
    self.tokenizer = Tokenizer(tokenizer_mode, stopwords_file)
    self.indexer = Indexer(positional_flag=positional_flag)
    self.ranker = Ranker(queries_path=queries_path, mode=rank_mode,
                         docs_limit=docs_limit)
    self.file = file
    # defines the number of lines to be read at once
    self.chunksize = chunksize
    self.block_number = 0
    # used in BM25 to track each document's length and the average over all docs
    self.docs_length = {}
    # collection size
    self.collection_size = 0

def scheduled_job():
    """
    This job runs every Monday at 12.
    """
    now = datetime.datetime.now()
    podcasts = Ranker('internet-tecnologia', 445, 5).build()
    Storage.save('storage/ranking_{0}-{1}-{2}.json'.format(
        now.year, now.strftime('%m'), now.strftime('%d')), podcasts)

def __init__(self, parser, output_path, stem):
    """
    :param parser: parser used to process documents
    :param output_path: directory for output files
    :param stem: stemming flag
    """
    self.parser = parser
    self.ranker = Ranker(output_path, stem)
    self.path = output_path
    self.counter = 1
    self.stem = stem
    self.lda_model = None
    self.dictionary = None
    self.dict = None
    self.documents = None
    self.docslen = 0
    self.documentfilenames = {
        'zero_documents': 0,
        'first_documents': 0,
        'second_documents': 0,
        'third_documents': 0,
        'fourth_documents': 0,
        'fifth_documents': 0,
        'sixth_documents': 0,
        'seventh_documents': 0,
        'eighth_documents': 0,
        'ninth_documents': 0,
    }

def __init__(self):
    self.du = DU()
    self.vocab, self.recab = self.du.initialize_vocabulary()
    # Each line of the ids file is a space-separated list of token ids;
    # empty lines become empty lists.
    self.ids_arr = []
    for line in open(self.du.ids_path):
        line = line.strip()
        if len(line) > 0:
            self.ids_arr.append([int(tok) for tok in line.split(' ')])
        else:
            self.ids_arr.append([])
    self.mark = json.load(open(self.du.mark_path))
    self.train = json.load(open(self.du.train_path))
    self.dev = json.load(open(self.du.dev_path))
    self.test = json.load(open(self.du.test_path))
    self.model = Ranker(
        vocab_size=FLAGS.vocab_size,
        embedding_size=FLAGS.emd_size,
        memory_size=FLAGS.mem_size,
        batch_size=FLAGS.batch_size,
        max_dialogue_size=FLAGS.max_dialogue_size,
        max_sentence_size=FLAGS.max_sentence_size,
        margin=FLAGS.margin,
        max_gradient_norm=FLAGS.max_gradient_norm,
        learning_rate=FLAGS.learning_rate,
        learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
        use_lstm=False,
        train_mode=FLAGS.train,
        # drop_out=FLAGS.drop_out,
        # layer=FLAGS.layer,
    )

def __init__(self, inverted_index):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.parser = Parse()
    self.ranker = Ranker()
    self.inverted_index = inverted_index

def train():
    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)

    print('Training MF')
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
    i2i = Item2Item(dataset)

    print('Generating candidates')
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = candidate_generator.generate_train()
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    import pickle
    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump((X_train, y_train, q_train, q_train_reader,
                         X_val, y_val, q_val, q_val_reader), f)
    except Exception:
        print("Couldn't save puke")

    print('Training ranker')
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))

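# train() above calls Ranker.train(X, y, q, X_val, y_val, q_val), where q holds
# per-query group sizes, which is the typical shape of a learning-to-rank setup.
# One plausible backing for that interface is LightGBM's LGBMRanker; the wrapper
# below is only a sketch under that assumption, and the default file name
# 'ranker.txt' is illustrative, not taken from the original project.
import lightgbm as lgb

class Ranker:
    def __init__(self):
        self.model = lgb.LGBMRanker(objective='lambdarank')

    def train(self, X_train, y_train, q_train, X_val, y_val, q_val):
        # group=q_train tells LightGBM how many rows belong to each query.
        self.model.fit(X_train, y_train, group=q_train,
                       eval_set=[(X_val, y_val)], eval_group=[q_val],
                       eval_at=[10])

    def rank(self, X):
        # Higher scores mean higher predicted relevance.
        return self.model.predict(X)

    def save(self, path='ranker.txt'):
        self.model.booster_.save_model(path)
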
def __init__(self, config=None):
    self._config = config
    # self._parser = Parse()
    self._parser = Parse(self._config)
    self._indexer = Indexer(self._config)
    self._ranker = Ranker()
    self._model = None

def __init__(self, parser, indexer, model=None, model_1=None):
    self._parser = parser
    self._indexer = indexer
    self._ranker = Ranker()
    self._model = model
    self._model_1 = model_1
    self.spellcheck = Spell_check()

def __init__(self, inverted_index, config=None):
    """
    :param inverted_index: dictionary of inverted index
    """
    # self.parser = Parse()
    self.ranker = Ranker()
    self.inverted_index = inverted_index
    self.config = config

def __init__(self, parser, indexer, model=None, wordnet=False, correction=False):
    self._parser = parser
    self.indexer = indexer
    self._ranker = Ranker()
    self._model = model
    # method toggles
    self.wordnet_toggle = wordnet
    self.spelling_corr_toggle = correction

def __init__(self, parser, indexer, model=None):
    self._parser = parser
    self._indexer = indexer
    self._ranker = Ranker()
    self._model = model
    self._config = self._indexer.config
    self._method_class = None

def __init__(self, inverted_index, stemming, word2vec):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.parser = Parse(stemming)
    self.ranker = Ranker()
    self.inverted_index = inverted_index
    self.word2vec = word2vec

def __init__(self):
    super().__init__()
    self.model_lm = LanguageModel()
    self.model_ct = ContentTransfer()
    self.kb = KnowledgeBase()
    self.ranker = Ranker(self.model_lm)
    self.local = True

def __init__(self, parser, indexer, model=None):
    self._parser = parser
    self._indexer = indexer
    # indexer_dic = indexer.load_index("idx_bench.pkl")
    indexer_dic = indexer.load_index("idx.pkl")  # TODO - this we need to submit
    if "tweet_dic" in indexer_dic:
        self._ranker = Ranker(indexer_dic["posting"], indexer_dic["docs"],
                              indexer_dic["tweet_dic"])
    else:
        self._ranker = Ranker(indexer_dic["posting"], indexer_dic["docs"])
    self._model = model
    self.posting_dic = indexer_dic["posting"]
    self.invert_dic = indexer_dic["invert"]
    self.doc_dic = indexer_dic["docs"]
    # Feature toggles are encoded as the presence of keys in the loaded index.
    self.word2vec = "word2vec" in indexer_dic and model is not None
    self.Sij_dic = indexer_dic.get("global")
    self.word_net = "wordnet" in indexer_dic
    self.local = "local" in indexer_dic
    self.spellcheck = "spellChecker" in indexer_dic
    self.relevant_docs = {}
    self.counter_of_terms = {}
    self.unique_tweets_num = set()

def __init__(self, parser, indexer, model=None):
    # self._model = model
    self.parser = parser
    self.ranker = Ranker(indexer.tweet_info)
    self.inverted_index = indexer.inverted_idx
    self.firstUnion = True
    self.posting_dir = ConfigClass.get_output()
    self.DocsToRetrieve = ConfigClass.numOfDocsToRetrieve
    self.scoreLowerBoundFactor = 0.5

def add_all_wordstarts_matching(self, hits, query, max_hits):
    lower_query = query.lower()
    if lower_query in self.basenames_by_wordstarts:
        ranker = Ranker()
        for basename in self.basenames_by_wordstarts[lower_query]:
            rank = ranker.rank(query, basename)
            hits[basename] = rank
            # Stop collecting once we have enough hits.
            if len(hits) >= max_hits:
                return

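# add_all_wordstarts_matching() above relies on Ranker.rank(query, candidate)
# returning a numeric match quality. The original scoring logic is not shown;
# the toy Ranker below only illustrates that interface with a simple
# prefix-coverage heuristic and is an assumption, not the real implementation.
class Ranker:
    def rank(self, query, candidate):
        # Reward case-insensitive prefix matches, weighted by how much of
        # the candidate the query covers.
        q, c = query.lower(), candidate.lower()
        if not c or not c.startswith(q):
            return 0.0
        return len(q) / len(c)
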
def __init__(self, parser, indexer, config, model=None):
    self._parser = parser
    self._indexer = indexer
    self._ranker = Ranker(config)
    self._model = model
    self._the_count = config.the_count
    self._wordnet_count = config.wordnet_count
    self._min_relevant = config.min_relevant
    self._ext_val = config.ext_val

def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except Exception:
        print("Couldn't save submit_puke")
    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)
    base = 0
    entire_articles = []
    not_heavy_items = set(range(1, article_count + 1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50
    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base + group_size]
            scores = rank_scores[base:base + group_size]
            # Sort this reader's candidates by descending ranker score.
            articles = [a for _, a in sorted(zip(scores, articles),
                                             key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)
            # Pad with random non-heavy items until 100 recommendations.
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)
            entire_articles.extend(articles)
            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)
            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))
            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))

def __init__(self):
    super().__init__()
    self.model_mrc = BidafQA()
    self.model_cmr = ConversingByReading()
    self.model_open = DialoGPT()
    self.kb = KnowledgeBase()
    model_mmi = DialoGPT(path_model='models/DialoGPT/small_reverse.pkl')
    self.ranker = Ranker(self.model_open, model_mmi)
    self.local = True

def __init__(self, parser, indexer, model=None):
    self._parser = parser
    self._indexer = indexer
    self._ranker = Ranker()
    self._model = model
    self.number_of_docs = 0
    self.upper_limit = 2000
    self.inverted_index = self._indexer.get_inverted_index()
    self.docs_index = self._indexer.get_docs_index()
    # Cache the corpus's average document length on the Ranker class.
    Ranker.avdl = self._indexer.total_docs_len / self._indexer.get_docs_count()

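# The last line above stores the average document length on the Ranker class;
# "avdl" is the conventional name for that quantity in BM25. Assuming that is
# what the ranker uses it for, a per-term BM25 contribution could be computed
# as below; k1 and b are the usual textbook defaults, not values taken from
# this project.
import math

def bm25_term_score(tf, df, doc_len, num_docs, avdl, k1=1.2, b=0.75):
    # Inverse document frequency of the term.
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    # Term frequency normalized by document length relative to avdl.
    norm_tf = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avdl))
    return idf * norm_tf
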
def __init__(self, inverted_index, path):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.parser = Parse()
    self.ranker = Ranker()
    self.inverted_index = inverted_index
    self.path = path
    self.global_method = GlobalMethod(inverted_index, path)
    self.global_method.execute_global_method_and_generate_matrix()

def __init__(self, inverted_index, corpus_size, average_length, output_path):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.parser = Parse()
    self.ranker = Ranker()
    self.inverted_index = inverted_index
    self.corpus_size = corpus_size
    self.average_length = average_length
    self.output_path = output_path

def __init__(self, inverted_index, tweet_dict):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.parser = Parse()
    self.ranker = Ranker()
    self.inverted_index = inverted_index
    self.tweet_dict = tweet_dict
    self.avg_tweet_length = tweet_dict["metadata"]["avgLength"]
    self.max_referrals = tweet_dict["metadata"]["maxReferrals"]
    self.min_timestamp = tweet_dict["metadata"]["minTimestamp"]
    self.max_timestamp = tweet_dict["metadata"]["maxTimestamp"]

def __init__(self, inverted_index, document_dict, n, avg_length_per_doc,
             glove_dict, config):
    """
    :param inverted_index: dictionary of inverted index
    """
    self.ranker = Ranker(avg_length_per_doc, document_dict, config)
    self.inverted_index = inverted_index
    self.document_dict = document_dict
    self.term_to_doclist = {}
    self.number_of_documents = n
    self.glove_dict = glove_dict
    self.config = config