Example #1
def main():
    experiment_set = final_experiment
    print("There are {} experiments to run".format(len(experiment_set)))
    train_data_path = "data/training.dat"
    dev_data_path = "data/full/dev.dat"
    tst_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    with open(feats_path) as f:
        num_feats = sum(1 for _ in f)
    batch_size = 80
    runs_per_experiment = 5

    for experiment_name, exp_features in experiment_set.items():
        logger.info("Running experiment {}".format(experiment_name))
        out_path = 'output/experiments_v3/{}'.format(experiment_name)
        makedirs(out_path, exist_ok=True)
        train_instances = load_data(train_data_path, num_feats, exp_features)
        dev_instances = load_data(dev_data_path, num_feats, exp_features)
        dev_eval_instances = load_eval_data(dev_data_path, num_feats,
                                            exp_features)
        tst_instances = load_eval_data(tst_data_path, num_feats, exp_features)
        logger.info("Loaded {} training instances with {} features".format(
            len(train_instances), num_feats))
        for i in range(runs_per_experiment):
            iter_path = out_path + '/v{}'.format(i)
            makedirs(iter_path, exist_ok=True)
            ranker = Ranker(num_feats, 256)
            trainer = RankerTrainer(ranker, batch_size, iter_path)
            trainer.train(train_instances, dev_instances, None,
                          dev_eval_instances, tst_instances)
Example #2
 def __init__(self, docs, path):
     """
     :param docs: (term_dict, document_dict) pair
     :param path: path to the posting files
     """
     self.ranker = Ranker(self)
     self.term_dict, self.document_dict = docs
     self.POSTING_PATH = path
Example #3
 def __init__(self, parser, indexer, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self.terms_searched = {}
     self.total_num_of_docs = parser.curr_idx
Example #4
def main(args):
    torch.manual_seed(333)
    if use_cuda:
        torch.cuda.manual_seed(333)
    random.seed(333)
    train_data_path = "data/training.dat"
    train_eval_data_path = "data/train-eval.dat"
    dev_data_path = "data/full/dev.dat"
    eval_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    with open(feats_path) as f:
        num_feats = sum(1 for _ in f)
    batch_size = 80
    ranker = Ranker(num_feats, 256)
    # Instances for training, loaded as pairs
    feat_indices = set(range(num_feats))
    train_instances = load_data(train_data_path, num_feats, feat_indices)
    train_eval_instances = load_eval_data(train_data_path, num_feats,
                                          feat_indices)
    dev_instances = load_data(dev_data_path, num_feats, feat_indices)
    dev_eval_instances = load_eval_data(dev_data_path, num_feats, feat_indices)
    tst_instances = load_eval_data(eval_data_path, num_feats, feat_indices)
    logger.info("Loaded {} training instances with {} features".format(
        len(train_instances), num_feats))
    trainer = RankerTrainer(ranker, batch_size, 'output/')
    trainer.train(train_instances, dev_instances, train_eval_instances,
                  dev_eval_instances, tst_instances)
    ranker.save('output/ranker.model')
Example #5
 def __init__(self, inverted_index, posting_file=None):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.posting_file = posting_file
Example #6
 def __init__(self, parser, indexer, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self._docs_dict = {}
     self.number_of_documents = len(indexer.docs_dict)
Example #7
    def __init__(self,
                 tokenizer_mode,
                 file='../content/metadata.csv',
                 stopwords_file="../content/snowball_stopwords_EN.txt",
                 chunksize=10000,
                 queries_path='../content/queries.txt',
                 rank_mode='bm25',
                 docs_limit=50,
                 positional_flag=False):
        self.tokenizer = Tokenizer(tokenizer_mode, stopwords_file)
        self.indexer = Indexer(positional_flag=positional_flag)
        self.ranker = Ranker(queries_path=queries_path,
                             mode=rank_mode,
                             docs_limit=docs_limit)
        self.file = file

        # defines the number of lines to be read at once
        self.chunksize = chunksize
        self.block_number = 0

        # used in BM25 to track each document's length and the average over all docs
        self.docs_length = {}

        # collection size
        self.collection_size = 0
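The comments above note that `docs_length` and `collection_size` feed a BM25 ranker. The `Ranker` class itself is not shown, so as an illustrative sketch (not the project's actual implementation), here is the standard BM25 contribution of one query term to one document:

import math

def bm25_term_score(tf, df, doc_len, avg_doc_len, collection_size,
                    k1=1.2, b=0.75):
    # tf: term frequency in the document; df: number of documents containing
    # the term; doc_len / avg_doc_len: this document's length and the average
    # over the collection (what docs_length above is tracking).
    idf = math.log((collection_size - df + 0.5) / (df + 0.5) + 1)
    norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return idf * norm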
Example #8
def scheduled_job():
    """
    This job runs every Monday at 12:00.
    """
    now = datetime.datetime.now()
    podcasts = Ranker('internet-tecnologia', 445, 5).build()
    Storage.save('storage/ranking_{0}-{1}-{2}.json'.format(
        now.year, now.strftime('%m'), now.strftime('%d')), podcasts)
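The docstring says the job runs every Monday at 12:00, but the scheduler wiring is not shown. One way to register it, assuming APScheduler (a hypothetical choice; the original may use cron or another library):

# Hypothetical wiring with APScheduler; the original scheduler is not shown.
from apscheduler.schedulers.blocking import BlockingScheduler

scheduler = BlockingScheduler()
scheduler.add_job(scheduled_job, 'cron', day_of_week='mon', hour=12)
scheduler.start()  # blocks and fires scheduled_job every Monday at 12:00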
Example #9
 def __init__(self, parser, output_path, stem):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.parser = parser
     self.ranker = Ranker(output_path, stem)
     self.path = output_path
     self.counter = 1
     self.stem = stem
     self.lda_model = None
     self.dictionary = None
     self.dict = None
     self.documents = None
     self.docslen = 0
     self.documentfilenames = {
         'zero_documents': 0,
         'first_documents': 0,
         'second_documents': 0,
         'third_documents': 0,
         'fourth_documents': 0,
         'fifth_documents': 0,
         'sixth_documents': 0,
         'seventh_documents': 0,
         'eighth_documents': 0,
         'ninth_documents': 0
     }
Example #10
    def __init__(self):
        self.du = DU()
        self.vocab, self.recab = self.du.initialize_vocabulary()
        self.ids_arr = []
        with open(self.du.ids_path) as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    self.ids_arr.append([int(tok) for tok in line.split(' ')])
                else:
                    self.ids_arr.append([])

        self.mark = json.load(open(self.du.mark_path))
        self.train = json.load(open(self.du.train_path))
        self.dev = json.load(open(self.du.dev_path))
        self.test = json.load(open(self.du.test_path))

        self.model = Ranker(
            vocab_size=FLAGS.vocab_size,
            embedding_size=FLAGS.emd_size,
            memory_size=FLAGS.mem_size,
            batch_size=FLAGS.batch_size,
            max_dialogue_size=FLAGS.max_dialogue_size,
            max_sentence_size=FLAGS.max_sentence_size,
            margin=FLAGS.margin,
            max_gradient_norm=FLAGS.max_gradient_norm,
            learning_rate=FLAGS.learning_rate,
            learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
            use_lstm=False,
            train_mode=FLAGS.train,
            # drop_out=FLAGS.drop_out,
            # layer=FLAGS.layer
        )
Example #11
 def __init__(self, inverted_index):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.parser = Parse()
     self.ranker = Ranker()
     self.inverted_index = inverted_index
Example #12
def train():
    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()

    dataset = Dataset(preprocessor)

    print('Training MF')
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
    i2i = Item2Item(dataset)

    print('Generating candidates')
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = candidate_generator.generate_train()
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    import pickle
    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump((X_train, y_train, q_train, q_train_reader,
                         X_val, y_val, q_val, q_val_reader), f)
    except Exception:
        print("Couldn't save puke")

    print('Training ranker')
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))
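`dataset.validate_ndcg` is not shown; for reference, a minimal NDCG@k over one query group (illustrative only, with linear gains; the project's exact metric may differ):

import math

def ndcg_at_k(relevances, scores, k=10):
    # relevances: ground-truth gains per candidate; scores: ranker predictions,
    # aligned by index. DCG discounts each gain by log2(rank + 2).
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    dcg = sum(relevances[i] / math.log2(rank + 2)
              for rank, i in enumerate(order[:k]))
    ideal = sorted(relevances, reverse=True)[:k]
    idcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0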
Example #13
 def __init__(self, config=None):
     self._config = config
     # self._parser = Parse()
     self._parser = Parse(self._config)
     self._indexer = Indexer(self._config)
     self._ranker = Ranker()
     self._model = None
Example #14
 def __init__(self, parser, indexer, model=None, model_1=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self._model_1 = model_1
     self.spellcheck = Spell_check()
Example #15
 def __init__(self, inverted_index, config=None):
     """
     :param inverted_index: dictionary of inverted index
     """
     #self.parser = Parse()
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.config = config
Example #16
 def __init__(self, parser, indexer, model=None, wordnet=False, correction=False):
     self._parser = parser
     self.indexer = indexer
     self._ranker = Ranker()
     self._model = model
     # method toggles
     self.wordnet_toggle = wordnet
     self.spelling_corr_toggle = correction
Example #17
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer

        self._ranker = Ranker()
        self._model = model
        self._config = self._indexer.config
        self._method_class = None
Example #18
 def __init__(self, inverted_index, stemming, word2vec):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.parser = Parse(stemming)
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.word2vec = word2vec
Example #19
    def __init__(self):
        super().__init__()

        self.model_lm = LanguageModel()
        self.model_ct = ContentTransfer()
        self.kb = KnowledgeBase()
        self.ranker = Ranker(self.model_lm)
        self.local = True
Example #20
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        # indexer_dic = indexer.load_index("idx_bench.pkl")
        indexer_dic = indexer.load_index("idx.pkl")  # TODO: this is the index to submit
        if "tweet_dic" in indexer_dic:
            self._ranker = Ranker(indexer_dic["posting"], indexer_dic["docs"], indexer_dic["tweet_dic"])
        else:
            self._ranker = Ranker(indexer_dic["posting"], indexer_dic["docs"])

        self._model = model

        self.posting_dic = indexer_dic["posting"]
        self.invert_dic = indexer_dic["invert"]
        self.doc_dic = indexer_dic["docs"]

        if "word2vec" in indexer_dic and model is not None:
            self.word2vec = True
        else:
            self.word2vec = False

        if "global" in indexer_dic:
            self.Sij_dic = indexer_dic["global"]
        else:
            self.Sij_dic = None

        if "wordnet" in indexer_dic:
            self.word_net = True
        else:
            self.word_net = False

        if "local" in indexer_dic:
            self.local = True
        else:
            self.local = False

        if "spellChecker" in indexer_dic:
            self.spellcheck = True
        else:
            self.spellcheck = False


        self.relevant_docs = {}
        self.counter_of_terms = {}
        self.unique_tweets_num = set()
Example #21
 def __init__(self, parser, indexer, model=None):
     # self._model = model
     self.parser = parser
     self.ranker = Ranker(indexer.tweet_info)
     self.inverted_index = indexer.inverted_idx
     self.firstUnion = True
     self.posting_dir = ConfigClass.get_output()
     self.DocsToRetrieve = ConfigClass.numOfDocsToRetrieve
     self.scoreLowerBoundFactor = 0.5
Example #22
 def add_all_wordstarts_matching(self, hits, query, max_hits):
     lower_query = query.lower()
     if lower_query in self.basenames_by_wordstarts:
         ranker = Ranker()
         for basename in self.basenames_by_wordstarts[lower_query]:
             rank = ranker.rank(query, basename)
             hits[basename] = rank
             if len(hits) >= max_hits:
                 return
Example #23
 def __init__(self, parser, indexer, config, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker(config)
     self._model = model
     self._the_count = config.the_count
     self._wordnet_count = config.wordnet_count
     self._min_relevant = config.min_relevant
     self._ext_val = config.ext_val
Example #24
def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except Exception:
        print("Couldn't save submit_puke")

    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)
    base = 0
    entire_articles = []
    not_heavy_items = set(range(1, article_count+1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50

    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base+group_size]
            scores = rank_scores[base:base+group_size]

            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)
            entire_articles.extend(articles)

            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)

            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))

            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
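`entropy` is imported elsewhere in the original; a plausible implementation computing Shannon entropy over the recommended articles' empirical frequencies (an assumption; the project's version is not shown):

import math
from collections import Counter

def entropy(items):
    # Shannon entropy (bits) of the empirical distribution of items; higher
    # means the recommendations are spread over more distinct articles.
    counts = Counter(items)
    total = sum(counts.values())
    if total == 0:
        return 0.0
    return -sum((c / total) * math.log2(c / total) for c in counts.values())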
Example #25
    def __init__(self):
        super().__init__()

        self.model_mrc = BidafQA()
        self.model_cmr = ConversingByReading()
        self.model_open = DialoGPT()
        self.kb = KnowledgeBase()
        model_mmi = DialoGPT(path_model='models/DialoGPT/small_reverse.pkl')
        self.ranker = Ranker(self.model_open, model_mmi)
        self.local = True
Example #26
 def __init__(self, parser, indexer, model=None):
     self._parser = parser
     self._indexer = indexer
     self._ranker = Ranker()
     self._model = model
     self.number_of_docs = 0
     self.upper_limit = 2000
     self.inverted_index = self._indexer.get_inverted_index()
     self.docs_index = self._indexer.get_docs_index()
     Ranker.avdl = self._indexer.total_docs_len / self._indexer.get_docs_count()
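(Here `avdl` is the average document length that BM25's length normalization divides by, as in the sketch after Example #7; assigning it on the class shares one value across all Ranker instances.)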
Example #27
 def __init__(self, inverted_index, path):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.parser = Parse()
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.path = path
     self.global_method = GlobalMethod(inverted_index, path)
     self.global_method.execute_global_method_and_generate_matrix()
Example #28
 def __init__(self, inverted_index, corpus_size, average_length,
              output_path):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.parser = Parse()
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.corpus_size = corpus_size
     self.average_length = average_length
     self.output_path = output_path
Example #29
 def __init__(self, inverted_index, tweet_dict):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.parser = Parse()
     self.ranker = Ranker()
     self.inverted_index = inverted_index
     self.tweet_dict = tweet_dict
     self.avg_tweet_length = tweet_dict["metadata"]["avgLength"]
     self.max_referrals = tweet_dict["metadata"]["maxReferrals"]
     self.min_timestamp = tweet_dict["metadata"]["minTimestamp"]
     self.max_timestamp = tweet_dict["metadata"]["maxTimestamp"]
Example #30
 def __init__(self, inverted_index, document_dict, n, avg_length_per_doc,
              glove_dict, config):
     """
     :param inverted_index: dictionary of inverted index
     """
     self.ranker = Ranker(avg_length_per_doc, document_dict, config)
     self.inverted_index = inverted_index
     self.document_dict = document_dict
     self.term_to_doclist = {}
     self.number_of_documents = n
     self.glove_dict = glove_dict
     self.config = config