def with_synonyme_meal():
	"""Rebuild the similarity index for meal labels using synonym translation.

	Every entry of ``label_meal_db`` becomes a document whose tokens are
	``cut(translate_synonymes(name))`` and whose payload is the untranslated
	name; the documents are appended to the module-level ``label_dic``.
	The index stored under ``servers/create_test_withsyn_meal1`` (relative to
	this file's directory) is then dropped, and ``label_dic`` is uploaded,
	used to train an LSI model, and indexed.
	"""
	for position, entry in enumerate(label_meal_db):
		entry_id = entry['id']
		entry_name = entry['name']
		synonym_text = translate_synonymes(entry_name)
		document = {
			'id': 'doc_%i' % entry_id,
			'tokens': cut(synonym_text),
			'payload': entry_name,
		}
		label_dic.append(document)
		logger.info(position)
		logger.info('label_id= %s' % entry_id)
	# Disabled code, kept for reference: the same preparation for the
	# title+text rows of mysql_db.
	'''
	for j in range(0,len(mysql_db)):
		mysql_data_list=mysql_db[j]
		article_id=mysql_data_list[0]	#id
		article_label=mysql_data_list[1] #label
		article_title=mysql_data_list[2] #title
		article_text=mysql_data_list[4] #text
		if article_title==None:
			article_title=''
		if article_text==None:
			article_text=''
		article_title_text=article_title+article_text
		article_title_text_translate_synonymes=translate_synonymes(article_title_text)
		article_title_text_dic.append({'id': 'doc_%i' % article_id, 'tokens': cut(article_title_text_translate_synonymes), 'payload': article_title_text})
	'''
	module_dir = os.path.abspath(os.path.dirname(__file__))
	server_path = os.path.join(module_dir, 'servers/create_test_withsyn_meal1')  # model directory
	server = SessionServer(server_path)
	server.drop_index()  # remove every existing index
	utils.upload_chunked(server, label_dic, chunksize=1000)  # upload in chunks of 1000 documents
	server.train(label_dic, method='lsi')  # train on the prepared documents
	server.index(label_dic)  # build the index
	return None
Beispiel #2
0
def GensimClient(texts):
    """Train and index ``texts`` on the local similarity server and compare them.

    Each text is tokenized with ``utils.simple_preprocess`` and stored under
    the id ``doc_<n>``, where ``n`` is its position in ``texts``.

    Args:
        texts: iterable of raw text documents.

    Returns:
        The result of ``findSimilar(texts, server, corpus)``, or ``None`` when
        any step raised (the error is logged at debug level).
    """
    similarities = None

    # The server keeps its state in ``gensim_server/`` under the current
    # working directory.
    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)

    logger.debug(u"%s" % server.status())

    try:
        corpus = [{
            u"id": u"doc_%i" % num,
            u"tokens": utils.simple_preprocess(text)
        } for num, text in enumerate(texts)]

        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)

        server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        server.index(corpus)

        similarities = findSimilar(texts, server, corpus)

    except Exception as msg:  # ``as`` form is valid on Python 2.6+ and Python 3
        logger.debug(u"%s" % msg)

    # Hand the computed result back; it was previously assigned but discarded.
    return similarities
Beispiel #3
0
def train_model(service=None):
    """Upload, train on, and index the module-level ``_texts`` corpus.

    Every ``(file_name, text)`` pair of ``_texts`` becomes one document whose
    id is the file name and whose tokens come from ``utils.simple_preprocess``.

    Args:
        service: server object the corpus is uploaded to, trained on and
            indexed with. Must not be ``None``.

    Raises:
        ValueError: if ``service`` is ``None``.
    """
    # Identity test: ``== None`` can be hijacked by a custom ``__eq__``.
    if service is None:
        raise ValueError(
            "You should service value in train_model using result from service_initialization.\n"
        )
    corpus = [{
        'id': file_name,
        'tokens': utils.simple_preprocess(text)
    } for file_name, text in _texts.items()]
    utils.upload_chunked(service, corpus, chunksize=1000)
    service.train(corpus, method='lsi')
    service.index(corpus)
Beispiel #4
0
def GensimClient(texts):
    """Index ``texts`` on the local similarity server and log similar documents.

    Each text is tokenized with ``utils.simple_preprocess`` and stored under
    the id ``url_<n>``. After uploading, training (LSI) and indexing, every
    document is used as a query; each match other than the query itself is
    logged with its score and the tokens it shares with the query.

    Args:
        texts: iterable of raw text documents.
    """
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)

    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)

    server = SessionServer(gss)

    logger.info(u"%s" % server.status())

    corpus = [{u"id": u"url_%i" % n, u"tokens": utils.simple_preprocess(text)} for n, text in enumerate(texts)]

    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)

    server.train(corpus, method=u"lsi")

    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Resolve matches by their full id. Parsing only the last character of
    # the id breaks as soon as there are ten or more documents.
    tokens_by_id = dict((entry[u"id"], entry[u"tokens"]) for entry in corpus)

    # Option one: query by the id of an already indexed document
    for n, entry in enumerate(corpus):
        # Query with the id the document was actually indexed under.
        doc_id = entry[u"id"]
        query_tokens = entry[u"tokens"]
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N doc_%d to %s" % (n, query_tokens))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc_id):
            match_id = sim[0]
            # Skip the query itself and ids that are not part of this corpus.
            if match_id == doc_id or match_id not in tokens_by_id:
                continue
            match_tokens = tokens_by_id[match_id]
            logger.info(u"\t%s \t %3.2f : M %s" % (match_id, float(sim[1]), match_tokens))

            # assumes simple_preprocess tokens are already unicode, so no
            # per-token conversion is needed -- TODO confirm
            common = set(match_tokens) & set(query_tokens)
            logger.info(u"\t\tCommon Topics : %s" % (list(common),))

    # Intentionally disabled example of querying with an ad-hoc document.
    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(str("Graph and minors and humans and trees."))}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
Beispiel #5
0
def GensimClient(texts):
    """Train and index ``texts`` on the local similarity server and compare them.

    Each text is tokenized with ``utils.simple_preprocess`` and stored under
    the id ``doc_<n>``, where ``n`` is its position in ``texts``.

    Args:
        texts: iterable of raw text documents.

    Returns:
        The result of ``findSimilar(texts, server, corpus)``, or ``None`` when
        any step raised (the error is logged at debug level).
    """
    similarities = None

    # The server keeps its state in ``gensim_server/`` under the current
    # working directory.
    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)

    logger.debug(u"%s" % server.status())

    try:
        corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)

        server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        server.index(corpus)

        similarities = findSimilar(texts, server, corpus)

    except Exception as msg:  # ``as`` form is valid on Python 2.6+ and Python 3
        logger.debug(u"%s" % msg)

    # Hand the computed result back; it was previously assigned but discarded.
    return similarities
# Print diagnostic counts. Every name used here is defined outside the
# visible part of the script.
print("len(entire_vocab_set) = ", len(entire_vocab_set))
print("i_accum_vocab = ", i_accum_vocab)

print("len(doc_list_vocab_set) = ", len(doc_list_vocab_set))

print("len(split_word_set) = ", len(split_word_set))
#print("len(split_word_set) = " , len(split_word_set) )

print("i_accum = ", i_accum)
# NOTE(review): the label says `corpus` but the value printed is
# len(corpus_A) -- confirm which one is intended.
print("len(corpus) = ", len(corpus_A))
print("len(doc_title_list) = ", len(doc_title_list))
print("i_num_tokens_max = ", i_num_tokens_max)
print("i_num_tokens_min = ", i_num_tokens_min)
#sys.exit(0)

# NOTE(review): this upload goes to `server`, while the object created just
# below and used for train/index is `service`. `server` is not defined in the
# visible code -- confirm it exists and is the intended target.
utils.upload_chunked(server, corpus, chunksize=1000)  # send 1k docs at a time

#service = SessionServer('C:/0_afc_working/0_Doc2Vec/gensim-simserver-master/my_server/') # or wherever
# Open the similarity server stored in folder_B.
service = SessionServer(folder_B)  # or wherever

logger.info("simberver_local_A: service.train(corpus, method='lsi')")

# Train an LSI model on the corpus.
service.train(corpus, method='lsi')

service.index(corpus)  # index the same documents that we trained on...
#sys.exit(0)

#service.delete(['doc_5', 'doc_8']) # supply a list of document ids to be removed from the index

#service.index(corpus[:3]) # overall index size unchanged (just 3 docs overwritten)
Beispiel #7
0
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

# Build the similarity index at import time from every document in the
# `ideas` collection of the `meteor` database.
sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})
corpus = [
    {'id': idea['_id'], 'tokens': utils.simple_preprocess(idea['text'])}
    for idea in cursor
]
utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

# Flask application wired to the same MongoDB host, port and database.
app = Flask(__name__)
app.config.update(
    MONGO_HOST='localhost',
    MONGO_PORT=3001,
    MONGO_DBNAME='meteor',
)
mongo = PyMongo(app)


class Idea(Document):
    structure = {
        'text': unicode,
        'parent_id': unicode,
        'date_created': datetime.datetime,
 def index(self, texts):
     """Upload, train on (LSI), and index the corpus built from ``texts``.

     The corpus is produced by ``self._create_corpus(texts)`` and sent to
     ``self.server`` in chunks of 1000 documents.
     """
     documents = self._create_corpus(texts)
     server = self.server
     utils.upload_chunked(server, documents, chunksize=1000)
     server.train(documents, method='lsi')
     server.index(documents)
 def index(self, texts):
     """Upload, train on (LSI), and index the corpus built from ``texts``.

     The corpus is produced by ``self._create_corpus(texts)`` and sent to
     ``self.server`` in chunks of 1000 documents.
     """
     documents = self._create_corpus(texts)
     server = self.server
     utils.upload_chunked(server, documents, chunksize=1000)
     server.train(documents, method='lsi')
     server.index(documents)
Beispiel #10
0
def test_Gensim(texts):
    """Index ``texts`` on the local similarity server and log similar documents.

    Each text is tokenized with ``utils.simple_preprocess`` and stored under
    the id ``doc_<n>``. After uploading, training (LSI) and indexing, every
    document is used as a query; each match other than the query itself is
    logged with its score and the tokens it shares with the query.

    Args:
        texts: iterable of raw text documents.

    Example input::

        texts = [u"Human machine interface for lab abc computer applications",
                 u"A survey of user opinion of computer system response time",
                 u"The EPS user interface management system",
                 u"System and human system engineering testing of EPS",
                 u"Relation of user perceived response time to error measurement",
                 u"The generation of random binary unordered trees",
                 u"The intersection graph of paths in trees",
                 u"Graph minors IV Widths of trees and well quasi ordering",
                 u"Graph minors A survey",
                 u"Why use a computer"]
    """
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)

    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)

    server = SessionServer(gss)

    logger.info(u"%s" % server.status())

    corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)

    server.train(corpus, method=u"lsi")

    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Resolve matches by their full id. Parsing only the last character of
    # the id breaks as soon as there are ten or more documents.
    tokens_by_id = dict((entry[u"id"], entry[u"tokens"]) for entry in corpus)

    # Option one: query by the id of an already indexed document
    for n, entry in enumerate(corpus):
        doc_id = entry[u"id"]
        query_tokens = entry[u"tokens"]
        logger.info(u"Find similar doc_%d to %s" % (n, query_tokens))
        for sim in server.find_similar(doc_id):
            match_id = sim[0]
            # Skip the query itself and ids that are not part of this corpus.
            if match_id == doc_id or match_id not in tokens_by_id:
                continue
            match_tokens = tokens_by_id[match_id]
            logger.info(u"\t%s \t %3.2f : %s" % (match_id, float(sim[1]), match_tokens))

            # assumes simple_preprocess tokens are already unicode, so no
            # per-token conversion is needed -- TODO confirm
            common = set(match_tokens) & set(query_tokens)
            logger.info(u"\tCommon Topics : %s\n" % (list(common),))

    # Intentionally disabled example of querying with an ad-hoc document.
    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
Beispiel #11
0
from bson.objectid import ObjectId
from flask import Flask, request
from mongokit import Document
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

# Build the similarity index at import time from every document in the
# `ideas` collection of the `meteor` database.
sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})
corpus = [
    {
        'id': idea['_id'],
        'tokens': utils.simple_preprocess(idea['text']),
    }
    for idea in cursor
]
utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

# Flask application wired to the same MongoDB host, port and database.
app = Flask(__name__)
app.config.update(
    MONGO_HOST='localhost',
    MONGO_PORT=3001,
    MONGO_DBNAME='meteor',
)
mongo = PyMongo(app)


class Idea(Document):
    structure = {
        'text':unicode,
        'parent_id': unicode,
        'date_created': datetime.datetime,