# Standard-library imports; the remaining names (log, config, Clustering,
# Database, clean, and the upper-case constants) are assumed to come from the
# Patent2Vec project's own modules.
import os


def main():
    log.info("*****Clustering Application*****")

    # Create a model for clustering patents
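    # The options mirror CLUTO-style clustering settings: "rbr" is repeated
    # bisection with refinement, "i2" the internal criterion function, and
    # "cos" cosine similarity (an interpretation based on the option names).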
    model = Clustering(method="rbr",
                       criterion="i2",
                       similarity="cos",
                       cluster_choice="best",
                       rowmodel="none",
                       colmodel="none",
                       trials=10,
                       showfeatures=False,
                       showsummaries=True,
                       summary_method="cliques",
                       showtree=False,
                       zscores=False,
                       plotclusters=True,
                       plotformat="ps")

    # Create an object of 'Database'
    db = Database(verbose=True)

    # Connect to SQLite database
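    # The on-disk embedding database is presumably loaded into an in-memory
    # SQLite connection so the embedding lookups below stay fast.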
    db.connect(in_memory=True, load_from=PATENT_EMBEDDING_DATABASE)

    # Collect document labels from the benchmark corpus
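    # Each label takes the form "<category>.<filename>", where the category
    # is the name of the directory containing the file.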
    documents = []
    for root, folders, files in os.walk(config.CLUSTERING_BENCHMARK_DATA):
        for file in files:
            # Skip hidden files; the original's str.endswith("") check was
            # always True, so no extension filter is applied here.
            if not file.startswith('.'):
                document_name = file
                document_category = root.rsplit(os.sep, 1)[1]
                document_label = document_category + "." + document_name
                documents.append(document_label)

    # Generate matrix of document embeddings
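    # patent2mat presumably looks up each document's stored embedding by
    # PRIMARY_KEY and writes a len(documents) x 300 matrix, plus the label
    # and class files consumed by the clustering step.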
    model.patent2mat(documents,
                     rows=len(documents),
                     columns=300,
                     database=db,
                     search_on=PRIMARY_KEY,
                     matrix=PATENT_MATRIX,
                     labels=LABELS,
                     classes=CLASSES,
                     path=PATENT_CLUSTERING_PATH)

    # Close connection to SQLite database
    db.close()

    # Cluster documents
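    # k=20 requests twenty clusters; with use_patent_classes=True the known
    # patent classes presumably serve as ground truth for evaluation.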
    model.train(matrix=PATENT_MATRIX,
                labels=LABELS,
                classes=CLASSES,
                use_patent_classes=True,
                k=20,
                iterations=20,
                patent_clusters=PATENT_CLUSTER,
                plot=PATENT_CLUSTER_PLOT,
                path=PATENT_CLUSTERING_PATH)

    # Clean up all unnecessary files
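    # The filter list presumably names artifacts to keep while everything
    # else produced by the run is removed.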
    clean(
        cleanSample=True,
        cleanModel=False,
        cleanDocvecs=True,
        cleanDatabase=False,
        cleanClusters=True,
        filter=["PatentCluster", "PatentCluster.ps", "PatentEmbedding.rclass"])
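
# Standard entry point so the example can be run as a script.
if __name__ == "__main__":
    main()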

# Example 2

# Standard-library imports; the remaining names (log, Patent2Vec,
# PatentDocument, Database, clean, PathNotFoundError, and the upper-case
# constants) are assumed to come from the Patent2Vec project's own modules.
import os
import time

from collections import OrderedDict


def main():
    log.info("*****Patent2Vec Application*****")

    # Preprocess patent documents
    log.info("Preprocessing patent documents")
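    # With train_ratio=1.0 and test_ratio=0.0, sampling is enabled but every
    # sampled document ends up in the training split.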
    patents = PatentDocument(SOURCE_DATASET,
                             extension="",
                             use_conceptualizer=True,
                             transform_conceptualizer=True,
                             enable_sampling=True,
                             train_ratio=1.0,
                             test_ratio=0.0)

    # Create Patent2Vec model
    models = OrderedDict()

    # PV-DM with average
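    # The flags follow gensim's Doc2Vec conventions: dm=1 selects the
    # distributed-memory model and dm_mean=1 averages (rather than
    # concatenates) the context vectors.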
    models["PV_DM_Mean"] = \
        Patent2Vec(dm=1, dm_mean=1, dm_concat=0, min_word_count=5, size=500,
                   context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
                   use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    models["PV_DM_Mean"].build(patents)
    models["PV_DM_Mean"].intersect_with_pretrained_embedding(
        PRETRAINED_EMBEDDING, binary=False)
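    # Intersecting presumably seeds the model's in-vocabulary word vectors
    # from a word2vec-format text file (binary=False) before training.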
    # models["PV_DM_Mean"].load(PATENT2VEC_MODEL)

    # Alternative model configurations, kept for reference but disabled:
    #
    # # PV-DM with concatenation
    # models["PV_DM_Concatenation"] = \
    #     Patent2Vec(dm=1, dm_mean=0, dm_concat=1, min_word_count=5, size=500,
    #                context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    # models["PV_DM_Concatenation"].reuse_from(models["PV_DM_Mean"])
    # models["PV_DM_Concatenation"].build(patents)
    # models["PV_DM_Concatenation"].intersect_with_pretrained_embedding(
    #     PRETRAINED_EMBEDDING, binary=False)
    # models["PV_DM_Concatenation"].load(PATENT2VEC_MODEL)
    #
    # # PV-DBOW
    # models["PV_DBOW"] = \
    #     Patent2Vec(dm=0, dm_mean=0, dm_concat=0, min_word_count=5, size=500,
    #                context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    # models["PV_DBOW"].reuse_from(models["PV_DM_Mean"])
    # models["PV_DBOW"].build(patents)
    # models["PV_DBOW"].intersect_with_pretrained_embedding(
    #     PRETRAINED_EMBEDDING, binary=False)
    # models["PV_DBOW"].load(PATENT2VEC_MODEL)
    #
    # # Mixed models
    # models["DBOW + DM with average"] = \
    #     ConcatenatedPatent2Vec([models["PV_DBOW"], models["PV_DM_Mean"]])
    # models["DBOW + DM with concatenation"] = \
    #     ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                             models["PV_DM_Concatenation"]])

    for name, model in models.items():
        # Train Patent2Vec model
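        # Ten passes over the corpus, with the learning rate decaying from
        # alpha to min_alpha (fixed_alpha=False).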
        start_time = time.time()
        model.train(patents,
                    alpha=0.1,
                    min_alpha=0.0001,
                    passes=10,
                    fixed_alpha=False)
        end_time = time.time()
        log.info("Total time elapsed: %r", (end_time - start_time))

        # Evaluate Patent2Vec model
        model.evaluate()

        # Save Patent2Vec model
        model.save(model=PATENT2VEC_MODEL)

        # Create a database object
        db = Database(verbose=True)

        # Connect to database
        db.connect(in_memory=True)

        # Create a new table for storing document embeddings
        db.create_table(table=PATENT_EMBEDDING_TABLE,
                        primary_column=PRIMARY_KEY,
                        other_columns=FIELDS)

        # Save document embeddings
        model.save_document_embeddings(document_embeddings=PATENT_EMBEDDING,
                                       rows=len(patents),
                                       columns=500,
                                       database=db,
                                       table_name=PATENT_EMBEDDING_TABLE,
                                       save_patent_category=True,
                                       prepend_document_category=True)

        # Test documents
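        # TESTING_DATA is expected to contain one test-document entry per
        # line; blank lines are ignored.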
        if not os.path.exists(TESTING_DATA):
            raise PathNotFoundError("Path does not exist: %s" % TESTING_DATA)

        with open(TESTING_DATA, "r") as t:
            test_documents = [line.strip() for line in t if line.strip()]

        # Preprocessed test documents
        preprocessed_test_documents = patents.get_preprocessed_corpus(
            test_documents)

        # Predict document embeddings
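        # Inference presumably runs `steps` update iterations per document
        # with the same alpha schedule used during training.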
        model.predict(preprocessed_test_documents,
                      alpha=0.1,
                      min_alpha=0.0001,
                      steps=50,
                      save=True,
                      database=db,
                      table_name=PATENT_EMBEDDING_TABLE,
                      save_patent_category=True,
                      prepend_document_category=True)

        # Create an index on document embedding table
        db.create_index(index=PATENT_EMBEDDING_INDEX,
                        table=PATENT_EMBEDDING_TABLE,
                        index_by_column=PRIMARY_KEY[0])

        # Close database connection
        db.close(save_to=PATENT_EMBEDDING_DATABASE)

        # Delete temporary training data
        model.clean()

    # Test document for checking the quality of Patent2Vec model
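    # Token-only mode presumably makes the preprocessor yield plain tokens
    # rather than tagged documents, which is what infer() expects below.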
    patents.set_token_only(True)
    preprocessed_test_document = patents.get_preprocessed_document(
        TEST_DOCUMENT)
    patents.set_token_only(False)

    # Check quality of Patent2Vec model
    if preprocessed_test_document is not None:
        log.info("Check quality of Patent2Vec model")
        log.info("Top matches for test document: %s", TEST_DOCUMENT)

        for name, model in models.items():
            embedding = model.infer(preprocessed_test_document)

            top_matches = model.model.docvecs.most_similar(
                positive=[embedding], negative=[], topn=10)

            for document_tag, similarity in top_matches:
                log.info("%s\t\t%s", document_tag, similarity)

    # Clean up all unnecessary files
    clean(cleanSample=True,
          cleanModel=False,
          cleanDocvecs=True,
          cleanDatabase=False,
          cleanClusters=False,
          filter=[])
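
# Standard entry point so the example can be run as a script.
if __name__ == "__main__":
    main()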