Beispiel #1
0
class D2VEngine(SearchEngine):
    """Search engine that ranks documents by Doc2Vec vector similarity.

    A query is tokenized by a ``Preprocessor``, a vector is inferred for it
    with the loaded Doc2Vec model, and the closest document vectors are
    returned together with their similarity scores.
    """

    def __init__(self, max_workers=4):
        """Set up the preprocessor; no model is loaded by this constructor.

        Args:
            max_workers: Forwarded to ``Preprocessor`` as ``max_workers``.
        """
        super().__init__()
        self.preprocessor = Preprocessor(max_workers=max_workers)

    @classmethod
    def from_configfile(cls):
        """Build an engine and load the model named in the project config.

        The profile name is read from the ``d2v_profile`` environment
        variable (default ``'local'``); the model path is the
        ``dbow_model_path`` entry of that profile's ``d2v`` section.

        Returns:
            A new instance of ``cls`` with its model loaded.
        """
        profile = environ.get('d2v_profile', 'local')
        config = Config(profile).d2v

        search_engine = cls()
        search_engine.load_model(config['dbow_model_path'])
        return search_engine

    def load_model(self, model_path, dict_path=None):
        """Load a Doc2Vec model from ``model_path`` into ``self.model``.

        Args:
            model_path: Location passed to ``Doc2Vec.load``.
            dict_path: Accepted but not used by this implementation.
        """
        self.model = Doc2Vec.load(model_path)

    def search(self, query, limit=50):
        """Return the documents most similar to ``query``.

        Args:
            query: Raw query text.
            limit: Maximum number of results (passed as ``topn``).

        Returns:
            Whatever ``docvecs.most_similar`` returns; ``dict_search``
            consumes it as an iterable of ``(tag, similarity)`` pairs.
        """
        inferred_vector = self._infer(query)
        return self.model.docvecs.most_similar([inferred_vector], topn=limit)

    def dict_search(self, query, limit=100):
        """Search and return a mapping of document tag to adjusted score.

        Scores of queries shorter than four words are scaled down by
        ``_adjust``.

        Args:
            query: Raw query text.
            limit: Maximum number of results to request from ``search``.

        Returns:
            dict mapping each result's tag (presumably a URL, going by the
            naming here) to its length-adjusted similarity.
        """
        results = self.search(query, limit=limit)

        # split() without an argument ignores leading, trailing and repeated
        # whitespace; split(" ") would count each of those as an extra empty
        # "word" and so understate the short-query penalty.
        query_len = len(query.split())
        return {
            url: self._adjust(query_len, similarity)
            for url, similarity in results
        }

    def _infer(self, document):
        """Preprocess ``document`` into tokens and infer its Doc2Vec vector."""
        tokens = self.preprocessor.preprocess_doc(document)
        return self.model.infer_vector(tokens, alpha=0.001, steps=40)

    def _adjust(self, query_length, similarity):
        """Scale down ``similarity`` for queries of fewer than four words.

        The divisor is ``(5 - query_length) ** 2``: 16 for one word, 9 for
        two, 4 for three (and 25 for an empty query). Queries of four or
        more words are returned unchanged.
        """
        if query_length < 4:
            return similarity / (5 - query_length) ** 2
        return similarity
Beispiel #2
0
 def __init__(self, topics, max_workers=4):
     """Initialize the base class, then keep the topics and a preprocessor.

     Args:
         topics: Stored unchanged on ``self.topics``.
         max_workers: Forwarded to ``Preprocessor`` as ``max_workers``.
     """
     super().__init__()
     preprocessor = Preprocessor(max_workers=max_workers)
     self.preprocessor = preprocessor
     self.topics = topics
Beispiel #3
0
def preprocess_tagged_wiki(wiki):
    """Return ``Preprocessor.preprocess_tagged_wiki(wiki)`` from a fresh instance.

    The ``Preprocessor`` is constructed with ``None`` as its only argument.
    """
    return Preprocessor(None).preprocess_tagged_wiki(wiki)
Beispiel #4
0
def preprocess_tagged_doc(articles, max_workers):
    """Preprocess ``articles`` with a new ``Preprocessor`` and return the result.

    Args:
        articles: Passed straight through to the preprocessor.
        max_workers: First positional argument of ``Preprocessor``.
    """
    worker = Preprocessor(max_workers)
    # NOTE(review): the method name is spelled "preproces_" (single "s");
    # presumably this matches Preprocessor's definition -- confirm.
    tagged_docs = worker.preproces_tagged_docs_with_urls(articles)
    return tagged_docs
Beispiel #5
0
def preprocess_wiki(wiki, max_workers):
    """Log the wiki being handled, then run ``Preprocessor.process_wiki`` on it.

    Args:
        wiki: Value that is logged and handed to ``process_wiki``.
        max_workers: First positional argument of ``Preprocessor``.
    """
    message = "preprocessing {0}".format(wiki)
    logger.info(message)
    return Preprocessor(max_workers).process_wiki(wiki)