# Code example #1
0
import sys

import gensim
import gridfs
import nltk
import pymongo

import DeepLearning as dl
from DeepLearning.database import MongoLoadDocumentMeta, MongoLoadDocumentData
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
language = 'english'
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stop_set = nltk.corpus.stopwords.words(language)
# NOTE(review): `stemmer` is not referenced anywhere in the visible script;
# kept so the module-level names stay unchanged.
stemmer = gensim.parsing.PorterStemmer()

# Metadata for every document in the 'testing_docs100' collection of the
# 'patents' database, plus a loader used below to fetch and clean the text.
mongodb = MongoLoadDocumentMeta('patents')
documents = mongodb.get_all_meta('testing_docs100')
corpus = MongoLoadDocumentData('patents',
                               documents,
                               clean_text=True,
                               tokenizer=tokenizer,
                               stop_set=stop_set,
                               description=True)

# Pre-trained word2vec model and the generator that turns cleaned text into
# an embedding matrix (configured with maxWords=200, embeddingSize=200).
word2vec_model = dl.learn.Word2VecTrainer().load_model('../word2vec.model')
word_vector_generator = dl.data_representation.Word2VecEmbeddingCreator(
    word2vec_model, maxWords=200, embeddingSize=200)

# Create the MongoDB client and resolve the target collection ONCE.
# These values do not depend on the document being processed; building a new
# MongoClient on every iteration would open a fresh connection pool per
# document and never release it.
client = pymongo.MongoClient()
patents_database = client.patents
word_embedding_collection = patents_database.testing_embedding_docs100

for document in documents:
    print(document['filename'], document['ipc_classes'])
    content = corpus.get_file_content(document['filename'])
    content = corpus.clean(content['description'])
    word_embedding_matrix = word_vector_generator.create_x_text(content)
    # NOTE(review): the visible code never writes `word_embedding_matrix` to
    # `word_embedding_collection`; presumably a persistence step is missing
    # or was truncated from this snippet -- confirm against the full script.
# Code example #2
0
import gensim
import nltk
import logging

from DeepLearning import database, learn
from DeepLearning.database import MongoLoadDocumentMeta, MongoLoadDocumentData


# Emit timestamp, level and message for every log record at INFO or above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# NOTE(review): `sg` is assigned but not referenced in the visible script.
sg = 1

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
language = 'english'

# Document metadata comes from the 'doc2vec_docs' collection of 'patents'.
mongodb = MongoLoadDocumentMeta('patents')
documents = mongodb.get_all_meta('doc2vec_docs')

# Text-processing helpers handed to the corpus loader.
stop_set = nltk.corpus.stopwords.words(language)
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
# NOTE(review): `stemmer` is created but not used in the visible script.
stemmer = gensim.parsing.PorterStemmer()

# Loader over the documents above with the abstract, description and claims
# flags all enabled. NOTE(review): `doc2vec_doc=True` presumably makes the
# loader yield items in the form the Doc2Vec trainer expects -- confirm in
# DeepLearning.database.
all_corpus = MongoLoadDocumentData(
    'patents',
    documents,
    clean_text=True,
    tokenizer=tokenizer,
    stop_set=stop_set,
    abstract=True,
    description=True,
    claims=True,
    doc2vec_doc=True,
)

# Debugging aid: iterate over `all_corpus` and print each item to inspect
# what the trainer will receive.

# Train the Doc2Vec model on the corpus and save it to disk.
trainer_options = {'iter': 20, 'min_alpha': 0.0001}
doc2vecTrainer = learn.Doc2VecTrainer(**trainer_options)
doc2vecTrainer.train(all_corpus)
doc2vecTrainer.save('doc2vec_mongo.model')