import gridfs
import nltk
import gensim
import pymongo
import DeepLearning as dl
from DeepLearning.database import MongoLoadDocumentMeta, MongoLoadDocumentData
import sys

'''
Configurations
'''
language = 'english'
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')        # keep word tokens only
stop_set = nltk.corpus.stopwords.words(language)
stemmer = gensim.parsing.PorterStemmer()

# Load document metadata and build a corpus over the patent descriptions
mongodb = MongoLoadDocumentMeta('patents')
documents = mongodb.get_all_meta('testing_docs100')
corpus = MongoLoadDocumentData('patents', documents, clean_text=True, tokenizer=tokenizer,
                               stop_set=stop_set, description=True)

# Load the pre-trained word2vec model and the embedding-matrix generator
word2vec_model = dl.learn.Word2VecTrainer().load_model('../word2vec.model')
word_vector_generator = dl.data_representation.Word2VecEmbeddingCreator(word2vec_model, maxWords=200,
                                                                        embeddingSize=200)

# Build a (maxWords x embeddingSize) embedding matrix for each document description
for document in documents:
    print(document['filename'], document['ipc_classes'])
    content = corpus.get_file_content(document['filename'])
    content = corpus.clean(content['description'])
    word_embedding_matrix = word_vector_generator.create_x_text(content)

# Collection intended to hold the per-document embeddings
client = pymongo.MongoClient()
patents_database = client.patents
word_embedding_collection = patents_database.testing_embedding_docs100
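# The loop above computes `word_embedding_matrix` for every document but never
# persists it. A minimal sketch of one way to do that, assuming the matrices are
# NumPy arrays and reusing the `gridfs` import and `word_embedding_collection`
# defined above; `store_embedding` and the field names are hypothetical, not part
# of the DeepLearning library.
import pickle

fs = gridfs.GridFS(patents_database)

def store_embedding(filename, ipc_classes, matrix):
    # Pickle the matrix into GridFS and keep a small reference document
    # in the embedding collection so it can be looked up by filename.
    file_id = fs.put(pickle.dumps(matrix), filename=filename)
    word_embedding_collection.insert_one({
        'filename': filename,
        'ipc_classes': ipc_classes,
        'embedding_file_id': file_id,
    })

# e.g. inside the loop above:
#   store_embedding(document['filename'], document['ipc_classes'], word_embedding_matrix)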
import gensim
import nltk
import logging
from DeepLearning import database, learn
from DeepLearning.database import MongoLoadDocumentMeta, MongoLoadDocumentData

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sg = 1  # skip-gram flag (not used by the doc2vec training below)

'''
Configurations
'''
language = 'english'
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stop_set = nltk.corpus.stopwords.words(language)
stemmer = gensim.parsing.PorterStemmer()

# Load document metadata and build a corpus over abstract, description and claims
# for doc2vec training
mongodb = MongoLoadDocumentMeta('patents')
documents = mongodb.get_all_meta('doc2vec_docs')
all_corpus = MongoLoadDocumentData('patents', documents, clean_text=True, tokenizer=tokenizer,
                                   stop_set=stop_set, abstract=True, description=True, claims=True,
                                   doc2vec_doc=True)

# for c in all_corpus:
#     print(c)

# Train and persist the doc2vec model
doc2vecTrainer = learn.Doc2VecTrainer(iter=20, min_alpha=0.0001)
doc2vecTrainer.train(all_corpus)
doc2vecTrainer.save('doc2vec_mongo.model')
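# A quick sanity check of the trained model, assuming Doc2VecTrainer.save()
# writes a standard gensim Doc2Vec file (the path 'doc2vec_mongo.model' is taken
# from the call above; the sample query text is illustrative only).
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('doc2vec_mongo.model')

# Infer an embedding for unseen text using the same tokenisation as training.
sample_tokens = [w for w in tokenizer.tokenize('method for coating a metal substrate'.lower())
                 if w not in stop_set]
vector = model.infer_vector(sample_tokens)

# Rank training documents by similarity to the inferred vector
# (docvecs is the gensim 3.x attribute; in gensim 4.x it is model.dv).
print(model.docvecs.most_similar([vector], topn=5))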