Code Example #1
def with_synonyme_meal():
	for i in range(0,len(label_meal_db)):
		#for i in range(0,3):
		label_list=label_meal_db[i]
		label_id=label_list['id']
		label=label_list['name']
		label_translate_synonymes=translate_synonymes(label)
		#label_translate_synonymes=label
		#label_dic.append({'id': 'doc_%i' % label_id, 'tokens': [label_translate_synonymes], 'payload': label_translate_synonymes})
		label_dic.append({'id': 'doc_%i' % label_id, 'tokens': cut(label_translate_synonymes), 'payload': label})
		logger.info(i)
		logger.info('label_id= %s' % label_id)
	'''
	for j in range(0,len(mysql_db)):
		mysql_data_list=mysql_db[j]
		article_id=mysql_data_list[0]	#id
		article_label=mysql_data_list[1] #label
		article_title=mysql_data_list[2] #title
		article_text=mysql_data_list[4] #text
		if article_title==None:
			article_title=''
		if article_text==None:
			article_text=''
		article_title_text=article_title+article_text
		article_title_text_translate_synonymes=translate_synonymes(article_title_text)
		article_title_text_dic.append({'id': 'doc_%i' % article_id, 'tokens': cut(article_title_text_translate_synonymes), 'payload': article_title_text})
	'''
	server_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'servers/create_test_withsyn_meal1',)  #--model path
	server = SessionServer(server_path)  
	server.drop_index()  # drop any existing index
	utils.upload_chunked(server, label_dic, chunksize=1000)  # upload to the simserver in chunks
	server.train(label_dic, method='lsi')  # train the LSI model on the processed labels
	server.index(label_dic)  # build the index
	#print(server.status())
	return None
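A minimal sketch of querying the index built above, assuming the same translate_synonymes and cut helpers and the same server path; find_similar returns (id, similarity, payload) tuples:

def query_meal_labels(text, min_similarity=0.1, max_results=5):
	# preprocess the query the same way the labels were indexed
	server_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'servers/create_test_withsyn_meal1')
	server = SessionServer(server_path)
	doc = {'tokens': cut(translate_synonymes(text))}
	return server.find_similar(doc, min_score=min_similarity, max_results=max_results)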
Code Example #2
def GensimClient(texts):
    similarities = None

    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)

    logger.debug(u"%s" % server.status())

    try:
        corpus = [{
            u"id": u"doc_%i" % num,
            u"tokens": utils.simple_preprocess(text)
        } for num, text in enumerate(texts)]

        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)

        server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        server.index(corpus)

        similarities = findSimilar(texts, server, corpus)

    except Exception, msg:
        logger.debug(u"%s" % msg)

    return similarities
Code Example #3
File: server.py  Project: kwyn/SSASS
def get_service():
  SERVER_DIR = '/tmp/simserver/'
  try:
    os.mkdir(SERVER_DIR)
  except OSError:
    pass  # the server directory already exists
  service = SessionServer(SERVER_DIR)
  service.set_autosession()
  return service
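A short usage sketch for the autosession service above; the two-document corpus is purely illustrative, and with autosession enabled each train/index call is committed automatically:

service = get_service()
docs = [{'id': 'doc_%i' % i, 'tokens': text.split()}
        for i, text in enumerate(["human computer interaction", "graph minors survey"])]
service.train(docs, method='lsi')
service.index(docs)
print service.find_similar('doc_0')  # (id, similarity, payload) tuples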
Code Example #4
def service_initialization(directory_path='.',
                           readme_path='.',
                           autosession=True):
    # directory_path: where to place this service, e.g. '../Extract_features_using_readmeAPIsource/'
    # readme_path: where the readme file source is stored, e.g. './Readme/Readme_set_complete'
    service = SessionServer(directory_path, autosession)
    if 'model' not in os.listdir(directory_path + '/a/'):
        upload_train(service, readme_path)
    return service
Code Example #5
def with_synoymes_meal():
    km_server = SessionServer(
        os.path.join(servers_path, 'create_test_withsyn_meal1'))  # the index built above
    article_db = db.query('select * from article_all1')

    min_similarity = 0.1  #0.2
    max_results = 5  #2
    #db.execute('update article_all1 set meal=null') #initial

    for i in range(0, len(article_db)):
        #for i in range(0,3):
        article_list = article_db[i]
        article_id = article_list['id']
        title = article_list['title']
        introduce = article_list['introduce']
        content = article_list['content']
        js_content = json.loads(content)
        content_all = ''
        for at in range(0, len(js_content)):
            js_content_list = js_content[at]
            js_content_content = js_content_list['content']
            js_content_title = js_content_list['title']
            soup_js_content_title = BeautifulSoup(js_content_title, 'html.parser')
            soup_js_content_content = BeautifulSoup(js_content_content, 'html.parser')
            soup_title = soup_js_content_title.get_text()
            soup_content = soup_js_content_content.get_text()
            content_all = content_all + soup_title + '.' + soup_content
            content_all = content_all.replace("\n", "")
        article = title + '.' + introduce + '.' + content_all
        #print(article)
        article_synonymes = translate_synonymes(article)  # same synonym translation used at training time
        article_label_list = add_label(article_synonymes, min_similarity,
                                       max_results, km_server)
        #print(article_id)
        #print(article_id,article_label_list)
        #print
        label_list_sql = []
        label_list_sql_sim = []
        for l in article_label_list:
            label_id = l[0][4:]  # strip the 'doc_' prefix
            similarity = l[1]
            label = l[2]
            label_list_sql.append(label)
            label_list_sql_sim.append((similarity, label))
            #print(article_id,label_id,similarity)
        # write the accumulated labels once per article, not once per label
        label_list_sql_sim_json = json.dumps(label_list_sql_sim)
        #db.execute('update article_all1 set meal=%s where id=%s',(label_list_sql,article_id))
        db.execute('update article_all1 set meal_sim=%s where id=%s',
                   (label_list_sql_sim, article_id))
        db.execute('update article_all1 set meal_sim_json=%s where id=%s',
                   (label_list_sql_sim_json, article_id))

        #print(label_list_sql)
        #print('-'*20)
    return None
Code Example #6
def GensimClient(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)

    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)

    server = SessionServer(gss)

    logger.info(u"%s" % server.status())

    corpus = [{u"id": u"url_%i" % n, u"tokens": utils.simple_preprocess(text)} for n, text in enumerate(texts)]

    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)

    server.train(corpus, method=u"lsi")

    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Option one
    for n in range(0, len(corpus)):
        doc = u"doc_%d" % n
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N doc_%d to %s" % (n, corpus[n][u"tokens"]))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc):
            m = int(sim[0].split(u"_")[1])  # doc index from "doc_<n>"; the original [-1:] breaks past doc_9
            if m != n:
                logger.info(u"\t%s \t %3.2f : M %s" % (sim[0], float(sim[1]), corpus[m][u"tokens"]))

                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]

                s1 = set(e)
                s2 = set(d)
                common = s1 & s2
                lc = [x for x in common]
                logger.info(u"\t\tCommon Topics : %s" % (lc))

    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
Code Example #7
    def findSimilarities(self, texts):
        gsDir = os.getcwd()
        logger.debug(u"GSDir %s" % gsDir)

        gss = gsDir + os.sep + u"gensim_server" + os.sep
        logger.debug(u"%s" % gss)

        server = SessionServer(gss)

        corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

        # send 1k docs at a time
        # utils.upload_chunked(server, corpus, chunksize=1000)

        # server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        # server.index(corpus)

        # overall index size unchanged (just 3 docs overwritten)
        # server.index(corpus[:3])

        # Option one
        if True:
            for n in range(0, len(texts)):
                doc = u"doc_%d" % n
                self.output += u"Find similar doc_%d to %s%s" % (n, corpus[n][u"tokens"], os.linesep)
                logger.info(self.output[:-1])

                for sim in server.find_similar(doc):
                    m = int(sim[0].split(u"_")[1])  # doc index from "doc_<n>"; the original [-1:] breaks past doc_9
                    if m != n:
                        self.output += u"\t%s \t %3.2f : %s%s" % (sim[0], float(sim[1]), corpus[m][u"tokens"], os.linesep)
                        logger.info(self.output[:-1])

                        d = [unicode(x) for x in corpus[n][u"tokens"]]
                        e = [unicode(y) for y in corpus[m][u"tokens"]]

                        s1 = set(e)
                        s2 = set(d)
                        common = s1 & s2
                        lc = [x for x in common]
                        self.output += u"\tCommon Topics : %s%s" % (lc, os.linesep)
                        logger.info(self.output[:-1])

        else:
            # Option two (unreachable while the flag above is True; the original
            # `else` was indented as a for-else, which ran this after every loop)
            doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
            logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))

        return self.output
Code Example #8
File: functions.py  Project: fizzy123/archives
def index_nodes():
    print "loading server"
    service = SessionServer('/mnt/hgfs/Shared/my_server/')
    print "loading model"
    service.open_session()
    service.session.drop_index()
    service.session.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")
    print service.session.model
    print "loading nodes"
    nodes = Node.objects.all()
    print "Building corpus"
    corpus = [{'id':node.pk,'tokens':re.findall(r"[\w']+",node.question.lower())} for node in nodes]
    print "indexing corpus"
    service.index(corpus)
    print service.stable.keys()  # the original printed the bound method, not the keys
    service.commit()
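A hedged companion sketch for querying the node index built above, tokenizing the question the same way index_nodes does:

def find_similar_nodes(question, min_score=0.8):
    service = SessionServer('/mnt/hgfs/Shared/my_server/')
    doc = {'tokens': re.findall(r"[\w']+", question.lower())}
    return service.find_similar(doc, min_score)  # (node pk, similarity, payload) tuples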
Code Example #9
 def __init__(self):
     self.server = SessionServer(r'c:\temp\data_server')
     print self.server
Code Example #10
 def __init__(self):
     self.service = SessionServer('SearchServer/')
Code Example #11
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import utils
from simserver import SessionServer
service = SessionServer('c:/temp/gensim')  # or wherever


def index_input_texts():
    texts = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
    corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # service.index(corpus)
    service.train(corpus, method='lsi')
    service.index(corpus)  # index the same documents that we trained on...

def query_the_index(query):
    doc = {'tokens': utils.simple_preprocess(query)}
    return service.find_similar(doc)  # (id, similarity, payload) tuples
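Tying the two helpers together (illustrative call):

if __name__ == '__main__':
    index_input_texts()
    for doc_id, score, _ in query_the_index("human computer interaction"):
        print doc_id, score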
Code Example #12
File: functions.py  Project: fizzy123/archives
        while n.content[0:3] == '-->':
            if n.content[3:5] == '*.':
                if Tag.objects.filter(title=n.content[5:]).exists():
                    tag = Tag.objects.get(title=n.content[5:])
                    if tag.node_set.all().exists():
                        n = choice(tag.node_set.all())
            else:
                if Node.objects.filter(title=n.content[3::]).exists():
                    n = Node.objects.get(title=n.content[3::])
                else:
                    log.debug('%s not found' % name)
                    n = Node.objects.get(title='idk')
        context = {'reply': parse_content(n.content, 'display'), 'title':n.title}
        return json_response(context), n

service = SessionServer('/mnt/hgfs/Shared/my_server/')
service.stable.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")

def parse(arguments, method):
    name = arguments['name']
    if method == 'GET':
        n = None
        while not n:
            matches = service.find_similar({'tokens': re.findall(r"[\w']+", name)}, 0.9)
            if len(matches):
                n = Node.objects.get(pk=matches[0][0])
            else:
                matches = service.find_similar({'tokens': re.findall(r"[\w']+", name)}, 0.8)
                if len(matches):
                    n = Node.objects.get(pk=matches[0][0])
                else:
Code Example #13
    'v3', 'v4', 'v5', 'v9', 'w', 'x', 'z'
]

i_tag_num_threshold = 5

#===========================
#===========================
i_1000_flag = 1
#i_1000_flag = 0
#===========================
#===========================

#server = SessionServer('/tmp/my_server') # resume server (or create a new one)
#server = SessionServer('./my_server') # resume server (or create a new one)
#server = SessionServer('./my_server_A') # resume server (or create a new one)
server = SessionServer(folder_A)  # resume server (or create a new one)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

logger = logging.getLogger('gensim.similarities.simserver')


def load_words():
    with open('words_alpha.txt') as word_file:
        valid_words = set(word_file.read().split())

    return valid_words

Code Example #14
File: simserver_test.py  Project: jannson/Similar
def iter_documents():
    # mirror of iter_corpus() below, over the crawled HTML pages
    for obj in HtmlContent.objects.all():
        doc = {}
        doc['id'] = 'html_%d' % obj.id
        doc['tokens'] = list(Tokenize(obj.content))
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc

def iter_corpus():
    for obj in SogouCorpus.objects.all():
        doc = {}
        doc['id'] = 'sogou_%d' % obj.id
        doc['tokens'] = obj.tokens.split(',')
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc

server = SessionServer('/tmp/server')
#server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
def train_server():
    training_corpus = iter_documents()
    #training_corpus = iter_corpus()
    #server.train(list(training_corpus), method='lsi')
    #print 'train finished'
    server.index(training_corpus)
    print 'index finished'
    server.optimize()
    print 'optimize finished'

def update_keywords():
    for html in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        html.tags, html.summerize = summarize(html.content)
        html.summerize = html.summerize[0:388]
Code Example #15
            tokens = preprocessor.tokenize(qtext)
            tokens = map(preprocessor.deNoise, tokens)
            devocalize_tokens = map(preprocessor.removeDiacritics, tokens)
            denoised_tokens = map(preprocessor.deNoise, devocalize_tokens)
            normalized_tokens = map(preprocessor.normalizeAlef,
                                    denoised_tokens)
            normalized_tokens = map(preprocessor.normalizeAggressive,
                                    normalized_tokens)
            lemmatized_tokens = map(preprocessor.lemmatize, normalized_tokens)

            # use the fully preprocessed tokens; the original passed the raw
            # `tokens`, leaving the lemmatization pipeline above unused
            yield LabeledSentence(words=list(lemmatized_tokens), tags=['%s' % qid])


from simserver import SessionServer

service = SessionServer('tmp/')

service.train(corpus, method='lsi')

import sys


class QuestionPairSimilarity(object):
    def __iter__(self):

        qs = LabeledQuestion('input/SemEval2016-Task3-CQA-MD-test.xml')
        for q in qs:

            service.drop_index()
            qid = q.tags[0]
            print qid
Code Example #16
 def __init__(self):
     self.server = SessionServer("./tmp")
Code Example #17
    def resume_scoring(self):
        """"
            Cleanes the data and runs the resume matching code. User is
            requested to pass the job description name, session_name and
            final output file name. Final output is an excel file.
 
            @param: job_description - string
            @param: session_name - string
            @param: output_filename - string
 
            Once you run this code it will prompt you to select the path of the
            directory           
        """
 
 
        self.job_description = self.select_job_description()
        if len(self.job_description) > 0:
 
            #self.job_description_path = os.path.join( self.job_description_path + "/" + job_description)
 
            self.raw_resumes_path =self.select_resume_path()
            if len(self.raw_resumes_path) > 0:               
                self.save_text_files_path = self.select_rawtext_path()
 
                self.raw_resumes_to_text()
                self.jd_to_text()
 
                self.file_list_text = glob.glob(self.save_text_files_path + "/*.*")
                print self.file_list_text
 
                self.resume_id = []
                for i in range(0, len(self.file_list_text)):
                    self.resume_id.append([int(s) for s in self.file_list_text[i].split() if s.isdigit()])
 
                self.documents = []
                for filename in self.file_list_text:
                    with open(filename, 'r') as f:
                        #d = f.read()
                        #print d
                        self.documents.append(f.read())
 
                self.corpus = [{'id': 'doc_%s' % num, 'tokens': utils.simple_preprocess(text)}
                  for num, text in enumerate(self.documents)]
 
                self.count = 0
                while self.count < len(self.resume_id):   
                    for item in self.corpus:
                        if self.resume_id[self.count] == []:
                            item['id'] = 'doc_jd'
                        else:
                            item['id'] = str(self.resume_id[self.count])
                        self.count =  self.count + 1
 
                self.regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
                self.tokenized_corpus_no_punctuation = []
 
                for review in self.corpus:
                    self.new_corpus = []
                    for token in review['tokens']:  # iterate the tokens, not the dict keys
                        self.new_token = self.regex.sub(u'', token)
                        if not self.new_token == u'':
                            self.new_corpus.append(self.new_token)
                    self.tokenized_corpus_no_punctuation.append(self.new_corpus)
 
                self.dir_name = self.setting_up_server_session_dir()       
                self.server = SessionServer(self.dir_name)
 
                logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
                self.server.train(self.corpus, method='lsi')
                self.server.index(self.corpus)
                self.lst = self.server.find_similar('doc_jd')
                self.series = pd.DataFrame(self.lst)
                self.series.columns = ['Resume_ID', 'Score', 'none']
                self.series.index.names = ['Rank']
 
                self.series = self.series.drop(self.series.columns[2], axis = 1)       
                self.final_excel_path()
Code Example #18
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 10 14:34:49 2018

@author: afcarl
"""

from gensim import utils
from simserver import SessionServer

import gensim

#server = SessionServer('/tmp/my_server') # resume server (or create a new one)
server = SessionServer('./my_server') # resume server (or create a new one)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

logger = logging.getLogger('gensim.similarities.simserver')

document = {'id': 'some_unique_string',
            'tokens': ['content', 'of', 'the', 'document', '...'],
            'other_fields_are_allowed_but_ignored': None}

texts = ["Human machine interface for lab abc computer applications",
         "A survey of user opinion of computer system response time",
         "The EPS user interface management system",
         "System and human system engineering testing of EPS",
         "Relation of user perceived response time to error measurement",
         "The generation of random binary unordered trees",
Code Example #19
import json
from bson import json_util
from bson.objectid import ObjectId
from flask import Flask, request
from mongokit import Document
from flask_pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})
corpus = [{
    'id': idea['_id'],
    'tokens': utils.simple_preprocess(idea['text'])
} for idea in cursor]
utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)
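
A hedged sketch of a query endpoint this app might expose (the route name and thresholds are assumptions, not part of the original):

@app.route('/similar', methods=['POST'])
def similar_ideas():
    # preprocess the posted text the same way the corpus was built
    doc = {'tokens': utils.simple_preprocess(request.json['text'])}
    matches = sim_server.find_similar(doc, min_score=0.2, max_results=10)
    return json.dumps(matches, default=json_util.default)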

Code Example #20
import os
from gensim import utils
from simserver import SessionServer

server = SessionServer('myserver')
w = open('data/1/549518.txt').read()

docin = {'id': '549518', 'tokens' : utils.simple_preprocess(w)}

print server.find_similar(docin)
Code Example #21
# An example by Steven Du, showing how to use this server for Chinese documents.
#
# train: let the server learn the LSI model
# index: set up your own pool of documents that you want queries to search
# find_similar: find similar documents in the indexed pool
# Input to train/index/find_similar is a list of {'id': 'doc_%i' % num, 'tokens': text.split()}


from simserver import SessionServer
import codecs
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

serverFilePath='./temp_index_dir'

server = SessionServer(serverFilePath) # resume server (or create a new one)


texts=['如果 也 没有 的话 。 这个 确实 没有 办法 了 。 我 个人 建议您 重装 一遍 这个 软件 看看 是否 还是 一样 卸载 程序 里 也 没有 呢',
'我能 直接 删掉 这些 文件 吗 ?',
'不 建议 呢 。 因为 不 确定 这些 文件 中 是否 有 其他软件 的 文件 呢',
'好 的 , 使用 看看 会断 么',
'它 只是 有时 自动 掉 , 以后 看看 怎么样',
'这个 是 您 无线 驱动 : http : / / driverdl . lenovo . com . cn / lenovo / driverfilesuploadfloder / 32228 / wlan _ win8 . 1 . exe',
'要是 问题 还是 出现 您 可以 安装 这个 试试',
'10 几个 版本 都 试过 了 么',
'目前 可以 确认 08 版本 以上 正常 运行',
'这个 是 电源 吧',
'http : / / weixin . lenovo . com . cn / img / files / user _ files / olhctjgaid22zzdnezguwbxzuxrq / voice / 16 _ 03 _ 17 / 1104209 _ 729724 _ 1458213046 . jpg',
'现在 不是 运行 问题 , 是 安装 问题',
'点 电源 卸载 没 反应 呢',
Code Example #22
from flask import Flask
from flask import json
from flask import request
from flask import Response
import os
app = Flask(__name__)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from gensim import utils

from simserver import SessionServer
#BEFORE TRAINING NEW MODEL - CHANGE PATH BELOW
service = SessionServer('/tmp/mirFlickr4500')

# FORMAT FOR DATA POSTED TO /index: {"id":NUMBER,"tokens":["STRING","STRING","STRING"]}


@app.route('/test', methods=['GET'])
def test():
    return "server is running"


@app.route('/index', methods=['POST'])
def indexPhoto():
    print(request.json)
    service.index(request.json)
    return "Recieved: " + json.dumps(request.json)
Code Example #23
#coding=utf-8

from simserver import SessionServer
server = SessionServer('/tmp/my_server')  # resume server (or create a new one)
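A brief sketch of the explicit-session workflow on the resumed server; modifications stay isolated in the session until committed (delete/commit/rollback are standard SessionServer calls):

server.open_session()      # start an explicit modification session
server.delete(['doc_5'])   # stage a change against the session copy
server.rollback()          # discard it; use server.commit() to persist instead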
Code Example #24
File: QueryIndex.py  Project: gopigof/Search-Engine
 def __init__(self):
     self.service = SessionServer('SearchServer/')
     self.search_results = []
Code Example #25
# Cleaned up from an interactive session: crawl pages, extract the main
# content, then train and index a similarity server over the pages.
# Assumes `urls` is a list of page URLs, `http` is an httplib2.Http()
# client and `g` is a goose Goose() extractor, as in the original session.
crawldocs = {}
maincontent = {}

for link in urls:
    print "Reading page: %s" % link
    status, response = http.request(link)
    crawldocs[link] = response

for link, raw_html in crawldocs.iteritems():
    maincontent[link] = g.extract(raw_html=raw_html)

import logging
from gensim import utils
from simserver import SessionServer

server = SessionServer('/tmp/my_simserver')

# 'tokens' must be a list of tokens, not a raw string (the session's first
# two corpus attempts passed the cleaned text directly, which indexes
# characters instead of words)
corpus = [{'id': link, 'tokens': utils.simple_preprocess(content.cleaned_text)}
          for link, content in maincontent.iteritems()]
server.train(corpus, method='lsi')
server.index(corpus)
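Once trained and indexed, any crawled page can be matched against the rest (illustrative):

some_url = next(iter(maincontent))    # any indexed page id
print server.find_similar(some_url)   # (url, similarity, payload) tuples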