def with_synonyme_meal():
    """Build the meal-label corpus and (re)build its simserver LSI index.

    For every record in the module-level ``label_meal_db``, the label text is
    expanded with translated synonyms, tokenised with ``cut`` and appended to
    the module-level ``label_dic`` list; the payload keeps the original
    (unexpanded) label.  The index under ``servers/create_test_withsyn_meal1``
    is then dropped and rebuilt from ``label_dic``.

    Returns:
        None.  All results are side effects on ``label_dic`` and the on-disk
        simserver model/index.
    """
    # enumerate() replaces the original range(0, len(...)) index loop.
    for i, label_list in enumerate(label_meal_db):
        label_id = label_list['id']
        label = label_list['name']
        # Expand the label with translated synonyms before tokenising.
        label_translate_synonymes = translate_synonymes(label)
        label_dic.append({'id': 'doc_%i' % label_id,
                          'tokens': cut(label_translate_synonymes),
                          'payload': label})
        logger.info(i)
        logger.info('label_id= %s' % label_id)
    # Model/index directory lives next to this source file.
    server_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                               'servers/create_test_withsyn_meal1')
    server = SessionServer(server_path)
    server.drop_index()  # drop all existing indexes first
    # simserver processes the documents in 1k-document chunks.
    utils.upload_chunked(server, label_dic, chunksize=1000)
    server.train(label_dic, method='lsi')  # train on the prepared documents
    server.index(label_dic)  # build the index file
    return None
def GensimClient(texts):
    """Train and index an LSI similarity model over *texts* and return the
    similarities computed by ``findSimilar``.

    Args:
        texts: iterable of raw document strings.

    Returns:
        Whatever ``findSimilar`` returns, or ``None`` if any step failed
        (failures are logged, not raised).
    """
    similarities = None
    gsDir = os.getcwd()
    # Model/session state lives in ./gensim_server/ under the CWD.
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)
    logger.debug(u"%s" % server.status())
    try:
        corpus = [{u"id": u"doc_%i" % num,
                   u"tokens": utils.simple_preprocess(text)}
                  for num, text in enumerate(texts)]
        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)
        server.train(corpus, method=u"lsi")
        # index the same documents that we trained on...
        server.index(corpus)
        similarities = findSimilar(texts, server, corpus)
    except Exception as msg:  # was Py2-only "except Exception, msg"
        logger.debug(u"%s" % msg)
    # BUG FIX: the computed result was previously dropped on the floor.
    return similarities
def train_model(service=None):
    """Train and index an LSI model on the module-level ``_texts`` mapping.

    Args:
        service: a simserver ``SessionServer`` obtained from
            ``service_initialization``.  Required; ``None`` raises.

    Raises:
        ValueError: if *service* is not supplied.
    """
    if service is None:  # "is None", not "== None"
        raise ValueError(
            "You should pass a service value to train_model using the "
            "result from service_initialization.\n"
        )
    # One document per file: id is the file name, tokens come from the text.
    corpus = [{'id': file_name, 'tokens': utils.simple_preprocess(text)}
              for file_name, text in _texts.items()]
    utils.upload_chunked(service, corpus, chunksize=1000)  # 1k docs at a time
    service.train(corpus, method='lsi')
    service.index(corpus)  # index the same documents that we trained on
    return
def GensimClient(texts):
    """Train an LSI similarity index over *texts*, then log, for every
    document, its similar documents and the tokens they have in common.

    Args:
        texts: iterable of raw document strings.
    """
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    logger.info(u"%s" % server.status())
    # BUG FIX: ids were created with a u"url_%i" prefix but queried below
    # (and deleted in the commented example) as u"doc_..."; use the same
    # "doc_" prefix everywhere so find_similar() can resolve them.
    corpus = [{u"id": u"doc_%i" % num,
               u"tokens": utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)
    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])
    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])
    # Option One: per-document similarity report
    for n in range(0, len(corpus)):
        doc = u"doc_%d" % n
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N doc_%d to %s" % (n, corpus[n][u"tokens"]))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc):
            # BUG FIX: int(sim[0][-1:]) only parsed the last character and
            # broke for ids >= 10; parse everything after the underscore.
            m = int(sim[0].rsplit(u"_", 1)[1])
            if m != n:
                logger.info(u"\t%s \t %3.2f : M %s"
                            % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                common = set(e) & set(d)
                lc = [x for x in common]
                logger.info(u"\t\tCommon Topics : %s" % (lc))
    if False:
        # Option two: ad-hoc free-text query against the index
        doc = {u"tokens": utils.simple_preprocess(
            str("Graph and minors and humans and trees."))}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4,
                                                max_results=50))
def GensimClient(texts):
    """Train and index an LSI similarity model over *texts*.

    Args:
        texts: iterable of raw document strings.

    Returns:
        The result of ``findSimilar``, or ``None`` when any step failed
        (the failure is logged at debug level rather than raised).
    """
    similarities = None
    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep  # ./gensim_server/
    server = SessionServer(gss)
    logger.debug(u"%s" % server.status())
    try:
        corpus = [{u"id": u"doc_%i" % num,
                   u"tokens": utils.simple_preprocess(text)}
                  for num, text in enumerate(texts)]
        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)
        server.train(corpus, method=u"lsi")
        # index the same documents that we trained on...
        server.index(corpus)
        similarities = findSimilar(texts, server, corpus)
    except Exception as msg:  # was Py2-only "except Exception, msg"
        logger.debug(u"%s" % msg)
    # BUG FIX: previously the function computed similarities but returned
    # None implicitly.
    return similarities
# Debug dump of corpus statistics gathered earlier in the script, followed by
# upload / LSI training / indexing of the prepared corpus.
print("len(entire_vocab_set) = ", len(entire_vocab_set))
print("i_accum_vocab = ", i_accum_vocab)
print("len(doc_list_vocab_set) = ", len(doc_list_vocab_set))
print("len(split_word_set) = ", len(split_word_set))
#print("len(split_word_set) = " , len(split_word_set) )
print("i_accum = ", i_accum)
print("len(corpus) = ", len(corpus_A))
print("len(doc_title_list) = ", len(doc_title_list))
print("i_num_tokens_max = ", i_num_tokens_max)
print("i_num_tokens_min = ", i_num_tokens_min)
#sys.exit(0)
# NOTE(review): documents are uploaded to `server` but trained/indexed on
# `service` below — looks inconsistent; confirm both names refer to the same
# SessionServer, or that the upload target is intentional.
utils.upload_chunked(server, corpus, chunksize=1000)  # send 1k docs at a time
#service = SessionServer('C:/0_afc_working/0_Doc2Vec/gensim-simserver-master/my_server/')  # or wherever
service = SessionServer(folder_B)  # or wherever
logger.info("simberver_local_A: service.train(corpus, method='lsi')")
service.train(corpus, method='lsi')
service.index(corpus)  # index the same documents that we trained on...
#sys.exit(0)
#service.delete(['doc_5', 'doc_8'])  # supply a list of document ids to be removed from the index
#service.index(corpus[:3])  # overall index size unchanged (just 3 docs overwritten)
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

# Module-level setup: build an LSI similarity index over every idea stored
# in the Meteor MongoDB, then configure a Flask + PyMongo app against the
# same database.
sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)  # Meteor's default Mongo port
db = client.meteor
cursor = db.ideas.find({})
# One simserver document per idea: Mongo _id as the doc id.
corpus = [{
    'id': idea['_id'],
    'tokens': utils.simple_preprocess(idea['text'])
} for idea in cursor]
utils.upload_chunked(sim_server, corpus, chunksize=1000)  # 1k docs at a time
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

# NOTE(review): `Flask` and `Document` are not imported in this chunk —
# presumably imported earlier in the file; verify.
app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)


class Idea(Document):
    # MongoKit schema for an idea document.
    structure = {
        'text': unicode,
        'parent_id': unicode,
        'date_created': datetime.datetime,
        # NOTE(review): class truncated in this chunk; remainder not visible.
def index(self, texts):
    """Turn *texts* into a corpus, upload it to the backing simserver,
    train an LSI model on it, and index those same documents."""
    docs = self._create_corpus(texts)
    backend = self.server
    # Push the documents up in 1k-document chunks before fitting.
    utils.upload_chunked(backend, docs, chunksize=1000)
    backend.train(docs, method='lsi')
    backend.index(docs)
def test_Gensim(texts):
    """Smoke-test the simserver pipeline: train/index an LSI model over
    *texts* and log each document's similar documents plus their shared
    tokens.

    Args:
        texts: iterable of raw document strings (e.g. the classic
            "Human machine interface..." / "Graph minors..." sample set).
    """
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)
    server = SessionServer(gss)
    logger.info(u"%s" % server.status())
    corpus = [{u"id": u"doc_%i" % num,
               u"tokens": utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)
    server.train(corpus, method=u"lsi")
    # index the same documents that we trained on...
    server.index(corpus)
    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])
    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])
    # Option One: per-document similarity report
    for n in range(0, len(texts)):
        doc = u"doc_%d" % n
        logger.info(u"Find similar doc_%d to %s" % (n, corpus[n][u"tokens"]))
        for sim in server.find_similar(doc):
            # BUG FIX: int(sim[0][-1:]) only read the last character, so any
            # id >= doc_10 was mis-parsed; take everything after the "_".
            m = int(sim[0].rsplit(u"_", 1)[1])
            if m != n:
                logger.info(u"\t%s \t %3.2f : %s"
                            % (sim[0], float(sim[1]), corpus[m][u"tokens"]))
                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]
                common = set(e) & set(d)
                lc = [x for x in common]
                logger.info(u"\tCommon Topics : %s\n" % (lc))
    if False:
        # Option two: ad-hoc free-text query against the index
        doc = {u"tokens": utils.simple_preprocess(
            u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4,
                                                max_results=50))
from bson.objectid import ObjectId
from flask import Flask, request
from mongokit import Document
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

# Module-level setup: build an LSI similarity index over every idea stored
# in the Meteor MongoDB, then configure a Flask + PyMongo app against the
# same database.
sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)  # Meteor's default Mongo port
db = client.meteor
cursor = db.ideas.find({})
# One simserver document per idea: Mongo _id as the doc id.
corpus = [{'id': idea['_id'],
           'tokens': utils.simple_preprocess(idea['text'])}
          for idea in cursor]
utils.upload_chunked(sim_server, corpus, chunksize=1000)  # 1k docs at a time
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)


class Idea(Document):
    # MongoKit schema for an idea document.
    structure = {
        'text': unicode,
        'parent_id': unicode,
        'date_created': datetime.datetime,
        # NOTE(review): class truncated in this chunk; remainder not visible.