Example 1
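This example runs the Patent2Vec pipeline end to end: it preprocesses a patent corpus, builds and trains a PV-DM model (alternative PV-DM/PV-DBOW configurations are left commented out), stores the learned document embeddings in a database, infers embeddings for held-out test documents, and finally spot-checks model quality by listing the ten nearest neighbours of a single test document.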
import os
import time
import logging
from collections import OrderedDict

# Project-specific imports are assumed here; the actual module layout of the
# Patent2Vec package is not shown in this example.
# from patent2vec import (Patent2Vec, ConcatenatedPatent2Vec, PatentDocument,
#                         Database, PathNotFoundError, clean)
# Constants such as SOURCE_DATASET, CPU_CORE, DOCVECS_MAP, PRETRAINED_EMBEDDING,
# PATENT2VEC_MODEL, the PATENT_EMBEDDING_* names, PRIMARY_KEY, FIELDS,
# TESTING_DATA and TEST_DOCUMENT are expected to come from the project's
# configuration module.

log = logging.getLogger(__name__)


def main():
    log.info("*****Patent2Vec Application*****")

    # Preprocess patent documents
    log.info("Preprocessing patent documents")
    patents = PatentDocument(SOURCE_DATASET,
                             extension="",
                             use_conceptualizer=True,
                             transform_conceptualizer=True,
                             enable_sampling=True,
                             train_ratio=1.0,
                             test_ratio=0.0)
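    # All documents go to training (train_ratio=1.0, test_ratio=0.0); held-out
    # test documents are read separately from TESTING_DATA below.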

    # Create Patent2Vec model
    models = OrderedDict()

    # PV-DM with average
    models["PV_DM_Mean"] = \
        Patent2Vec(dm=1, dm_mean=1, dm_concat=0, min_word_count=5, size=500,
                   context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
                   use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    models["PV_DM_Mean"].build(patents)
    models["PV_DM_Mean"].intersect_with_pretrained_embedding(
        PRETRAINED_EMBEDDING, binary=False)
    #     models["PV_DM_Mean"].load(PATENT2VEC_MODEL)

    #     # PV-DM with concatenation
    #     models["PV_DM_Concatenation"] = \
    #         Patent2Vec(dm=1, dm_mean=0, dm_concat=1, min_word_count=5, size=500,
    #                    context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                    use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    #     models["PV_DM_Concatenation"].reuse_from(models["PV_DM_Mean"])
    # #     models["PV_DM_Concatenation"].build(patents)
    # #     models["PV_DM_Concatenation"].intersect_with_pretrained_embedding(PRETRAINED_EMBEDDING,
    # #                                                                       binary=False)
    # # #     models["PV_DM_Concatenation"].load(PATENT2VEC_MODEL)

    #     # PV-DBOW
    #     models["PV_DBOW"] = \
    #         Patent2Vec(dm=0, dm_mean=0, dm_concat=0, min_word_count=5, size=500,
    #                    context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                    use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    #     models["PV_DBOW"].reuse_from(models["PV_DM_Mean"])
    # #     models["PV_DBOW"].build(patents)
    # #     models["PV_DBOW"].intersect_with_pretrained_embedding(PRETRAINED_EMBEDDING,
    # #                                                           binary=False)
    # # #     models["PV_DBOW"].load(PATENT2VEC_MODEL)

    #     # Mixed models
    #     models["DBOW + DM with average"] = ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                                                                models["PV_DM_Mean"]])
    #     models["DBOW + DM with concatenation"] = ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                                                                      models["PV_DM_Concatenation"]])

    for name, model in models.items():
        # Train Patent2Vec model
        log.info("Training model: %s", name)
        start_time = time.time()
        model.train(patents,
                    alpha=0.1,
                    min_alpha=0.0001,
                    passes=10,
                    fixed_alpha=False)
        end_time = time.time()
        log.info("Total time elapsed: %r", (end_time - start_time))

        # Evaluate Patent2Vec model
        model.evaluate()

        # Save Patent2Vec model
        model.save(model=PATENT2VEC_MODEL)

        # Create a database object
        db = Database(verbose=True)

        # Connect to an in-memory database; it is persisted to disk
        # when closed at the end of this loop iteration
        db.connect(in_memory=True)

        # Create a new table for storing document embeddings
        db.create_table(table=PATENT_EMBEDDING_TABLE,
                        primary_column=PRIMARY_KEY,
                        other_columns=FIELDS)

        # Save document embeddings (columns must match the model's vector size)
        model.save_document_embeddings(document_embeddings=PATENT_EMBEDDING,
                                       rows=len(patents),
                                       columns=500,
                                       database=db,
                                       table_name=PATENT_EMBEDDING_TABLE,
                                       save_patent_category=True,
                                       prepend_document_category=True)

        # Test documents
        if not os.path.exists(TESTING_DATA):
            raise PathNotFoundError("Path does not exist: %s" % TESTING_DATA)

        with open(TESTING_DATA, "r") as t:
            test_documents = [line.strip() for line in t if line.strip()]

        # Preprocess test documents
        preprocessed_test_documents = patents.get_preprocessed_corpus(
            test_documents)

        # Predict document embeddings
        model.predict(preprocessed_test_documents,
                      alpha=0.1,
                      min_alpha=0.0001,
                      steps=50,
                      save=True,
                      database=db,
                      table_name=PATENT_EMBEDDING_TABLE,
                      save_patent_category=True,
                      prepend_document_category=True)

        # Create an index on document embedding table
        db.create_index(index=PATENT_EMBEDDING_INDEX,
                        table=PATENT_EMBEDDING_TABLE,
                        index_by_column=PRIMARY_KEY[0])

        # Close database connection
        db.close(save_to=PATENT_EMBEDDING_DATABASE)

        # Delete temporary training data
        model.clean()

    # Test document for checking the quality of Patent2Vec model
    patents.set_token_only(True)
    preprocessed_test_document = patents.get_preprocessed_document(
        TEST_DOCUMENT)
    patents.set_token_only(False)

    # Check quality of Patent2Vec model
    if preprocessed_test_document is not None:
        log.info("Check quality of Patent2Vec model")
        log.info("Top matches for test document: %s", TEST_DOCUMENT)

        for name, model in models.items():
            log.info("Model: %s", name)
            embedding = model.infer(preprocessed_test_document)

            top_matches = model.model.docvecs.most_similar(
                positive=[embedding], negative=[], topn=10)

            for doc_tag, similarity in top_matches:
                log.info("%s\t\t%s", doc_tag, similarity)

    # Clean all unnecessary files
    clean(cleanSample=True,
          cleanModel=False,
          cleanDocvecs=True,
          cleanDatabase=False,
          cleanClusters=False,
          filter=[])
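
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()

The Patent2Vec class itself is not shown in this example, but its parameters (dm, dm_mean, dm_concat, size, context_window_size, negative, iter) track gensim's Doc2Vec almost one-to-one. Below is a rough, hypothetical sketch of the gensim 3.x calls such a wrapper would map to; the toy corpus and min_count=1 are stand-ins so the snippet runs end to end, whereas the example above uses min_word_count=5 on a real corpus.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus: one TaggedDocument per preprocessed patent.
corpus = [
    TaggedDocument(words=["battery", "cell", "electrode"], tags=["patent_0"]),
    TaggedDocument(words=["neural", "network", "training"], tags=["patent_1"]),
]

# PV-DM with averaged context vectors, mirroring the "PV_DM_Mean"
# configuration above (dm=1, dm_mean=1, dm_concat=0).
model = Doc2Vec(dm=1, dm_mean=1, dm_concat=0, min_count=1,
                vector_size=500, window=8, negative=2,
                epochs=50, workers=4)
model.build_vocab(corpus)

# Older gensim releases exposed intersect_word2vec_format() on the model,
# which is the likely backend of intersect_with_pretrained_embedding();
# the call is version-dependent, so it is left disabled here.
# model.intersect_word2vec_format("pretrained.txt", binary=False)

model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Infer an embedding for an unseen, preprocessed document and list its
# nearest training documents, as the quality check above does.
vector = model.infer_vector(["battery", "electrode"],
                            alpha=0.1, min_alpha=0.0001, steps=50)
for tag, similarity in model.docvecs.most_similar(positive=[vector], topn=2):
    print(tag, similarity)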