import pickle
import random

from tqdm import tqdm

# Preprocessor, Dataset, MF, Item2Item, CandidateGenerator, Ranker, entropy,
# article_count and result_path are assumed to be defined/imported elsewhere
# in the project.


def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except Exception as e:
        print("Couldn't save submit_puke:", e)

    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)
    base = 0
    entire_articles = []
    # article_count (total number of articles) is a module-level constant defined elsewhere.
    not_heavy_items = set(range(1, article_count + 1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50

    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base+group_size]
            scores = rank_scores[base:base+group_size]

            # Sort candidates by ranker score, highest first, then keep the top `cut`.
            articles = [a for _, a in sorted(zip(scores, articles), key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]
            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            # Top up with the reader's followable articles, allowing up to cut + 15 items.
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)
            # Pad the list to 100 recommendations with random non-heavy articles.
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)
            entire_articles.extend(articles)

            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)

            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))

            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
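
# entropy() is referenced above but not defined in this listing. A minimal sketch,
# assuming it measures the Shannon entropy (in bits) of the empirical distribution
# of recommended article ids; the project's actual implementation may differ.
import math
from collections import Counter

def entropy(items):
    counts = Counter(items)
    total = float(len(items))
    # Sum of -p * log2(p) over the observed item frequencies.
    return -sum((c / total) * math.log2(c / total) for c in counts.values())
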
    # __init__ of the TopicalFinder spider (the enclosing class definition is not shown here).
    def __init__(self, seed_urls=None, save_html=1, use_splash=1,
                 screenshot_dir='/memex-pinterest/ui/static/images/screenshots',
                 op_time=10, **kwargs):
        '''
        Constructs a spider instance from command-line arguments or the scrapyd daemon.

        :param seed_urls: comma-separated list of URLs; if empty, the crawler follows not-yet-crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes; a negative value disables this constraint
        :param kwargs: passed through to the base spider's __init__
        :return:
        '''
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)

        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60  # minutes -> seconds

        self.start_time = datetime.utcnow()
        self.finishing = False
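
# add_scheme_if_missing() is imported from the project's utilities and is not shown
# here. A minimal sketch of the assumed behavior: prepend "http://" to seed URLs
# that carry no scheme so Scrapy can fetch them; the real helper may differ.
def add_scheme_if_missing(url):
    if '://' in url:
        # URL already has an explicit scheme (http, https, ftp, ...); leave it alone.
        return url
    return 'http://' + url
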
def train_and_score_mongo():
    """ Rescore all items from mongo """
    
    print "**************Training*********************"
    train_on_user_input()


    print "**************Scoring and Indexing*****************"
    mmu = MemexMongoUtils()
    docs = mmu.list_all_urls_iterator(return_html = True)

    ranker = Ranker.load()
    for doc in tqdm(docs, leave = True):
        try:
            score = ranker.score_doc(doc)
        except:
            score = 0

        mmu.set_score(doc["url"], score)

    _score_hosts()
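
# _score_hosts() is defined elsewhere in this module and is not shown. A rough sketch
# of one plausible implementation: average the per-document scores for each host and
# store the result. set_host_score() is a hypothetical method used for illustration
# only; the real MemexMongoUtils storage call may differ.
from collections import defaultdict
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2

def _score_hosts():
    mmu = MemexMongoUtils()
    host_scores = defaultdict(list)
    # Group document scores by host.
    for doc in mmu.list_all_urls_iterator():
        host = urlparse(doc["url"]).netloc
        host_scores[host].append(doc.get("score", 0))
    # Store the mean score per host (hypothetical API call).
    for host, scores in host_scores.items():
        mmu.set_host_score(host, sum(scores) / float(len(scores)))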