class Indexer(object):
    def __init__(self):
        self.server = SessionServer("./tmp")

    def _create_corpus(self, texts):
        corpus = []
        for id, text in texts:
            corpus.append({
                'id': id,
                'tokens': utils.simple_preprocess(text)
            })
        return corpus

    def index(self, texts):
        corpus = self._create_corpus(texts)
        utils.upload_chunked(self.server, corpus, chunksize=1000)
        self.server.train(corpus, method='lsi')
        self.server.index(corpus)

    def add_documents(self, texts):
        self.index(texts)

    def recommend(self, id, max_results=10):
        print "Id is: ", id
        return self.server.find_similar(id, max_results=max_results)
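
A short usage sketch for the Indexer above (sample texts borrowed from the toy corpus used elsewhere on this page; gensim's utils import is assumed): index() expects (id, text) pairs, which _create_corpus unpacks.

indexer = Indexer()
indexer.index([("doc_0", "Human machine interface for lab abc computer applications"),
               ("doc_1", "A survey of user opinion of computer system response time")])
print indexer.recommend("doc_0", max_results=5)  # list of (id, similarity, payload) tuples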
Example #2
File: server.py Project: kwyn/SSASS
def get_service():
  SERVER_DIR = '/tmp/simserver/'
  try:
    os.mkdir(SERVER_DIR)
  except OSError:
    # the directory may already exist
    pass
  service = SessionServer(SERVER_DIR)
  service.set_autosession()
  return service
Example #3
    def findSimilarities(self, texts):
        gsDir = os.getcwd()
        logger.debug(u"GSDir %s" % gsDir)

        gss = gsDir + os.sep + u"gensim_server" + os.sep
        logger.debug(u"%s" % gss)

        server = SessionServer(gss)

        corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

        # send 1k docs at a time
        # utils.upload_chunked(server, corpus, chunksize=1000)

        # server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        # server.index(corpus)

        # overall index size unchanged (just 3 docs overwritten)
        # server.index(corpus[:3])

        # Option one
        if True:
            for n in range(0, len(texts)):
                doc = u"doc_%d" % n
                self.output += u"Find similar doc_%d to %s%s" % (n, corpus[n][u"tokens"], os.linesep)
                logger.info(self.output[:-1])

                for sim in server.find_similar(doc):
                    m = int(sim[0][-1:])
                    if m != n:
                        self.output += u"\t%s \t %3.2f : %s%s" % (sim[0], float(sim[1]), corpus[m][u"tokens"], os.linesep)
                        logger.info(self.output[:-1])

                        d = [unicode(x) for x in corpus[n][u"tokens"]]
                        e = [unicode(y) for y in corpus[m][u"tokens"]]

                        s1 = set(e)
                        s2 = set(d)
                        common = s1 & s2
                        lc = [x for x in common]
                        self.output += u"\tCommon Topics : %s%s" % (lc, os.linesep)
                        logger.info(self.output[:-1])

        else:
            # Option two
            doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
            logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))

        return self.output
def with_synonyme_meal():
	for i in range(0,len(label_meal_db)):
		#for i in range(0,3):
		label_list=label_meal_db[i]
		label_id=label_list['id']
		label=label_list['name']
		label_translate_synonymes=translate_synonymes(label)
		#label_translate_synonymes=label
		#label_dic.append({'id': 'doc_%i' % label_id, 'tokens': [label_translate_synonymes], 'payload': label_translate_synonymes})
		label_dic.append({'id': 'doc_%i' % label_id, 'tokens': cut(label_translate_synonymes), 'payload': label})
		logger.info(i)
		logger.info('label_id= %s' % label_id)
	'''
	for j in range(0,len(mysql_db)):
		mysql_data_list=mysql_db[j]
		article_id=mysql_data_list[0]	#id
		article_label=mysql_data_list[1] #label
		article_title=mysql_data_list[2] #title
		article_text=mysql_data_list[4] #text
		if article_title==None:
			article_title=''
		if article_text==None:
			article_text=''
		article_title_text=article_title+article_text
		article_title_text_translate_synonymes=translate_synonymes(article_title_text)
		article_title_text_dic.append({'id': 'doc_%i' % article_id, 'tokens': cut(article_title_text_translate_synonymes), 'payload': article_title_text})
	'''
	server_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'servers/create_test_withsyn_meal1',)  #--model path
	server = SessionServer(server_path)  
	server.drop_index()  # drop all existing indexes
	utils.upload_chunked(server, label_dic, chunksize=1000)  # upload to simserver in chunks
	server.train(label_dic, method='lsi')  # train on the preprocessed labels
	server.index(label_dic)  # build the index files
	#print(server.status())
	return None
class gensim_news(object):
    def __init__(self):
        self.server = SessionServer(r'c:\temp\data_server')
        print self.server

    def initialise(self, docs):
        corpus4server = self.create_server_corpus(docs)
        self.server.train(corpus4server, method='lsi')

    def create_server_corpus(self, docs):
        return [{'id': '%s' % id, 'tokens': simple_preprocess(text)} for id, text in docs.iteritems()]

    def gensim_similarities(self, docs_dict, new=False):
        text4server = self.create_server_corpus(docs_dict)
        sims = self.server.find_similar(text4server[0], min_score=0.90)
        self.server.index(text4server)
        return sims
Example #6
    def __init__(self, path, preprocess, deaccent=True, lowercase=True,
                 stemmer=None, stopwords=None):
        self.service = SessionServer(path)
        self.deaccent = deaccent
        self.lowercase = lowercase
        self.preprocess = preprocess
        self.stemmer = stemmer
        self.stopwords = stopwords
def main():
    json_data = open('./items.json')
    data = json.load(json_data)
    print 'starting'
    for i in range(0, len(data)-1):
        print i
        s = ""
        identifier = ""
        title = ""
        totalText = ""
        try:
            s = data[i]['identifier']
            identifier = s[0][18:].replace("%3A", "")
            summary = data[i]['desc'][0].strip()
            title = data[i]['title'][0].strip()
            totalText += summary
            totalText += " "
            totalText += title
            totalText += " "
            totalText += identifier
        except Exception:
            # a record may be missing one of the expected fields
            print "error"
        documentPayload = ({'identifier':identifier, 'title': title, 'summary' : summary})
        documents.append({'text' : totalText, 'payload' : documentPayload})
    corpus = [{'id': text['payload']['identifier'], 'tokens': utils.simple_preprocess(text['text']), 'payload': text['payload']} for text in documents]
    service = SessionServer('./thesite/simdatabase')
    service.train(corpus, method='lsi')
    service.index(corpus)
    service.commit()
Example #8
def GensimClient(texts):
    similarities = None

    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)

    logger.debug(u"%s" % server.status())

    try:
        corpus = [{
            u"id": u"doc_%i" % num,
            u"tokens": utils.simple_preprocess(text)
        } for num, text in enumerate(texts)]

        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)

        server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        server.index(corpus)

        similarities = findSimilar(texts, server, corpus)

    except Exception, msg:
        logger.debug(u"%s" % msg)

    return similarities
Example #9
def service_initialization(directory_path='.',
                           readme_path='.',
                           autosession=True):
    #'../Extract_features_using_readmeAPIsource/', directory to place this service
    #'./Readme/Readme_set_complete', directory where the readme file source is stored.
    service = SessionServer(directory_path, autosession)
    if 'model' not in os.listdir(directory_path + '/a/'):
        upload_train(service, readme_path)
    return service
Example #10
def with_synoymes_meal():
    km_server = SessionServer(
        os.path.join(servers_path, 'create_test_withsyn_meal1'))  # index server
    article_db = db.query('select * from article_all1')

    min_similarity = 0.1  #0.2
    max_results = 5  #2
    #db.execute('update article_all1 set meal=null') #initial

    for i in range(0, len(article_db)):
        #for i in range(0,3):
        article_list = article_db[i]
        article_id = article_list['id']
        title = article_list['title']
        introduce = article_list['introduce']
        content = article_list['content']
        js_content = json.loads(content)
        content_all = ''
        for at in range(0, len(js_content)):
            js_content_list = js_content[at]
            js_content_content = js_content_list['content']
            js_content_title = js_content_list['title']
            soup_js_content_title = BeautifulSoup(js_content_title)
            soup_js_content_content = BeautifulSoup(js_content_content)
            soup_title = soup_js_content_title.get_text()
            soup_content = soup_js_content_content.get_text()
            content_all = content_all + soup_title + '.' + soup_content
            content_all = content_all.replace("\n", "")
        article = title + '.' + introduce + '.' + content_all
        #print(article)
        article_synonymes = translate_synonymes(article)  # synonym conversion for the article text
        article_label_list = add_label(article_synonymes, min_similarity,
                                       max_results, km_server)
        #print(article_id)
        #print(article_id,article_label_list)
        #print
        label_list_sql = []
        label_list_sql_sim = []
        for l in article_label_list:
            label_id = l[0][4:]
            similarity = l[1]
            label = l[2]
            label_list_sql.append(label)
            label_list_sql_sim.append((similarity, label))
            label_list_sql_sim_json = json.dumps(label_list_sql_sim)
            #print(article_id,label_id,similarity)
            #print(article_id)
            #print(label_id)
            #db.execute('update article_all1 set meal=%s where id=%s',(label_list_sql,article_id))
            db.execute('update article_all1 set meal_sim=%s where id=%s',
                       (label_list_sql_sim, article_id))
            db.execute('update article_all1 set meal_sim_json=%s where id=%s',
                       (label_list_sql_sim_json, article_id))

        #print(label_list_sql)
        #print('-'*20)
    return None
Example #11
class IndexContent:
    def __init__(self):
        self.service = SessionServer('SearchServer/')

    def yield_page_text(self):
        for page_file in os.listdir('CrawlData'):
            content = open('CrawlData/' + page_file, 'r')
            page_content = content.read()
            content.close()
            page_url = re.sub('\s', '/', page_file)
            yield page_url, page_content

    def generate_index(self):
        corpus = [{
            'id': '%s' % url,
            'tokens': utils.simple_preprocess(text)
        } for url, text in self.yield_page_text()]
        self.service.train(corpus, method='lsi')
        self.service.index(corpus)
Example #12
class IndexContent:

    def __init__(self):
        self.service = SessionServer('SearchServer/')
    

    def yield_page_text(self):
        for page_file in os.listdir('CrawlData'):
            content = open('CrawlData/'+page_file, 'r')
            page_content = content.read()
            content.close()
            page_url = re.sub('\s', '/', page_file)
            yield page_url, page_content

    def generate_index(self):
        corpus = [{'id': '%s' % url, 'tokens': utils.simple_preprocess(text)}
                for url, text in self.yield_page_text()]
        self.service.train(corpus, method='lsi')
        self.service.index(corpus)
Example #13
class gensim_news(object):
    def __init__(self):
        self.server = SessionServer(r'c:\temp\data_server')
        print self.server

    def initialise(self, docs):
        corpus4server = self.create_server_corpus(docs)
        self.server.train(corpus4server, method='lsi')

    def create_server_corpus(self, docs):
        return [{
            'id': '%s' % id,
            'tokens': simple_preprocess(text)
        } for id, text in docs.iteritems()]

    def gensim_similarities(self, docs_dict, new=False):
        text4server = self.create_server_corpus(docs_dict)
        sims = self.server.find_similar(text4server[0], min_score=0.90)
        self.server.index(text4server)
        return sims
Example #14
class QueryIndex:
    def __init__(self):
        self.service = SessionServer('SearchServer/')
        self.search_results = []

    def query(self, user_query):
        doc = {'tokens': utils.simple_preprocess(user_query)}
        results = self.service.find_similar(doc, min_score=0.4, max_results=50)
        self.search_results = results

    def return_results(self):
        return self.search_results
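
A minimal driver for QueryIndex (the query string is illustrative); simserver's find_similar returns (id, score, payload) triples, so return_results() can be unpacked directly:

qi = QueryIndex()
qi.query('graph minors survey')  # hypothetical query
for page_url, score, payload in qi.return_results():
    print page_url, score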
Example #15
class QueryIndex:

    def __init__(self):
        self.service = SessionServer('SearchServer/')
        self.search_results = []

    def query(self, user_query):
        doc = {'tokens': utils.simple_preprocess(user_query)}
        results = self.service.find_similar(doc, min_score=0.4, max_results=50)
        self.search_results = results

    def return_results(self):
        return self.search_results
class Indexer(object):
    def __init__(self):
        self.server = SessionServer("./tmp")

    def _create_corpus(self, texts):
        corpus = []
        for id, text in texts:
            corpus.append({'id': id, 'tokens': utils.simple_preprocess(text)})
        return corpus

    def index(self, texts):
        corpus = self._create_corpus(texts)
        utils.upload_chunked(self.server, corpus, chunksize=1000)
        self.server.train(corpus, method='lsi')
        self.server.index(corpus)

    def add_documents(self, texts):
        self.index(texts)

    def recommend(self, id, max_results=10):
        print "Id is: ", id
        return self.server.find_similar(id, max_results=max_results)
def gensimsimserverII ():

    reloadData = True
    useremoteserver = False

    if (useremoteserver):
        server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
    else:
        server = SessionServer('/tmp/testserver') #SessionServer('myserver')


    if (reloadData):
        client = Elasticsearch([Util.config['eshost']])

        # response = client.search(
        #             index="blogs",
        #             body={
        #                 "size": "5000",
        #                 "query": {
        #                   "match": {
        #                     "country": country
        #                   }
        #                 }
        #             }
        #         )

        response = client.search(
                index="blogs",
                body={
                    "size": "5000",
                    "query": {"match_all": {}}
                }
            )

        stops = [unicode(word) for word in stopwords.words('english')] + [u':-).', u'–', u'-', u'…', '!!!', '!!', 'x', 'got', 'get', 'went', 'us', u'i\'m', '&','it\'s', 'i\'ve' ]
        corpus = []
        for hit in response['hits']['hits']:
            try:
                body = hit["_source"]["body"]
                id = hit["_source"]["url"]
                title = hit["_source"]["title"]
                newBody = [word for word in body.lower().split() if word not in stops]

                corpus.append({
                    'id': id,
                    'tokens':newBody,
                    'title':title
                })

                server.stable.payload[id] = title

            except Exception:
                logger.exception("Couldn't parse blog id: {0}".format(hit["_id"]))

        server.train(corpus, method='lsi')
        server.index(corpus)

    print "********************************************"
    print(server.find_similar('http://www.travelpod.com/travel-blog-entries/bvrlymm/1/1428224775/tpod.html', max_results=5))
Example #18
def index_nodes():
    print "loading server"
    service = SessionServer('/mnt/hgfs/Shared/my_server/')
    print "loading model"
    service.open_session()
    service.session.drop_index()
    service.session.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")
    print service.session.model
    print "loading nodes"
    nodes = Node.objects.all()
    print "Building corpus"
    corpus = [{'id':node.pk,'tokens':re.findall(r"[\w']+",node.question.lower())} for node in nodes]
    print "indexing corpus"
    service.index(corpus)
    print service.stable.keys()
    service.commit()
Example #19
def ajax(request):
    service = SessionServer('/Users/camron/Desktop/MIRCSearch/thesite/simdatabase')
    data = json.loads(request.body)
    print "DATA: \n"
    print data
    results = service.find_similar(data['identifier'], max_results=13)
    print results
    screen = []
    temp = []
    address = ""
    beginning = '/static/mirc/Thumbnails/'
    jpg = '.jpg'
    for i in range(0, len(results)):
        temp = results[i][2]
        address = beginning + results[i][0] + jpg
        temp['imgAdr'] = address
        screen.append(temp)
    a = Assemble(screen, data['width'], data['height'])
    a.do_the_work()
    finished = a.to_list()
    print "AJAX \n"
    print json.dumps(finished)
    return HttpResponse(json.dumps(finished), content_type = "application/json")
Example #20
def search(request):
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    service = SessionServer('/Users/camron/Desktop/MIRCSearch/thesite/simdatabase')
    form = SearchForm(request.POST)
    if not form.is_valid():
        return HttpResponse()
    search = form.cleaned_data['search']
    height = int(request.POST['height'])
    width = int(request.POST['width'])
    print height, " x ", width
    if form.is_valid():
        doc = {'tokens': utils.simple_preprocess(search)}
        print doc
        data = service.find_similar(doc, max_results=13)
        test = len(data)
        if test == 0:
            return HttpResponseRedirect('/mirc/noresults')
    # run data through the circle layout (Assemble) to get the positions
    screen = []
    temp = []
    address = ""
    beginning = '/static/mirc/Thumbnails/'
    jpg = '.jpg'
    #p = '/home/cocky/thesite/mirc/static/mirc/Thumbnails/'
    for i in range(0, len(data)):
        temp = data[i][2]
        address = beginning + data[i][0] + jpg
        #address = 'http://www.extremetech.com/wp-content/uploads/2013/08/bitcoin1.jpg'
        temp['imgAdr'] = address
        screen.append(temp)
    a = Assemble(screen, width, height)
    a.do_the_work()
    finished = a.to_list()
    print finished
    form = SearchForm()
    return render(request, 'mirc/dashboard.html',{'data':json.dumps(finished),'form':form})
Example #21
class SearchServer:

    def __init__(self):
        self.service = SessionServer('SearchServer/')
    

    def generate_index(self):
        def page_text():
            for page_file in os.listdir('CrawlData'):
                content = open('CrawlData/'+page_file, 'r')
                page_content = content.read()
                content.close()
                page_url = re.sub('\s', '/', page_file)
                yield page_url, page_content
        corpus = [{'id': '%s' % url, 'tokens': utils.simple_preprocess(text)}
                for url, text in page_text()]
        self.service.train(corpus, method='lsi')
        self.service.index(corpus)

    def query(self):
        user_string = raw_input('Enter query: ')
        doc = {'tokens': utils.simple_preprocess(user_string)}
        for results in self.service.find_similar(doc, min_score=0.4, max_results=50):
            print results[0]
def gensimsimserver ():
    server = SessionServer('myserver')
    texts = ["Human machine interface for lab abc computer applications",
          "A survey of user opinion of computer system response time",
          "The EPS user interface management system",
          "System and human system engineering testing of EPS",
          "Relation of user perceived response time to error measurement",
          "The generation of random binary unordered trees",
          "The intersection graph of paths in trees",
          "Graph minors IV Widths of trees and well quasi ordering",
          "Graph minors A survey"]
    corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
        for num, text in enumerate(texts)]
    server.train(corpus, method='lsi')
    server.index(corpus)
    print "********************************************"
    print(server.find_similar('doc_0'))
Example #23
def GensimClient(texts):
    similarities = None

    gsDir = os.getcwd()
    gss = gsDir + os.sep + u"gensim_server" + os.sep
    server = SessionServer(gss)

    logger.debug(u"%s" % server.status())

    try:
        corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

        # send 1k docs at a time
        utils.upload_chunked(server, corpus, chunksize=1000)

        server.train(corpus, method=u"lsi")

        # index the same documents that we trained on...
        server.index(corpus)

        similarities = findSimilar(texts, server, corpus)

    except Exception, msg:
        logger.debug(u"%s" % msg)

    return similarities
    def __init__(self):
        self.server = SessionServer("./tmp")
Example #25
        while n.content[0:3] == '-->':
            if n.content[3:5] == '*.':
                if Tag.objects.filter(title=n.content[5:]).exists():
                    tag = Tag.objects.get(title=n.content[5:])
                    if tag.node_set.all().exists():
                        n = choice(tag.node_set.all())
            else:
                if Node.objects.filter(title=n.content[3::]).exists():
                    n = Node.objects.get(title=n.content[3::])
                else:
                    log.debug('%s not found' % name)
                    n = Node.objects.get(title='idk')
        context = {'reply': parse_content(n.content, 'display'), 'title':n.title}
        return json_response(context), n

service = SessionServer('/mnt/hgfs/Shared/my_server/')
service.stable.model = simserver.SimModel.load("/mnt/hgfs/Shared/wiki")

def parse(arguments, method):
    name = arguments['name']
    if method == 'GET':
        n = None
        while not n:
            matches = service.find_similar({'tokens': re.findall(r"[\w']+", name)}, min_score=.9)
            if len(matches):
                n = Node.objects.get(pk=matches[0][0])
            else:
                matches = service.find_similar({'tokens': re.findall(r"[\w']+", name)}, min_score=.8)
                if len(matches):
                    n = Node.objects.get(pk=matches[0][0])
                else:
class DocSimServer(object):
    def __init__(self):
        self.server = SessionServer(settings.SIMSERVER_WORKING_DIR)
        if not self.server.stable.model:
            self.server.train(self.corpus)
        if not self.server.stable.fresh_index:
            self.server.index(self.corpus)

    def find_similar(self, *args, **kwargs):
        return self.server.find_similar(*args, **kwargs)

    @property
    def corpus(self):
        try:
            return self._corpus
        except AttributeError:
            logging.info('creating corpus from DB')
            self._corpus = [dict(id=doc.id, tokens=doc.tokens())
                            for doc in Document.objects.all()]
            return self._corpus

    @property
    def document_ids(self):
        try:
            return self._document_ids
        except AttributeError:
            self._document_ids = list(
                Document.objects.values_list('id', flat=True).order_by('id'))
            return self._document_ids

    @property
    def index_id(self):
        try:
            return self._index_id
        except AttributeError:
            self._index_id = dict(enumerate(self.document_ids))
            return self._index_id

    @property
    def id_index(self):
        try:
            return self._id_index
        except AttributeError:
            self._id_index = dict((v, k) for k, v in self.index_id.iteritems())
            return self._id_index

    def similarity_matrix(self):
        logging.info('calculating similarity matrix')
        s = identity(len(self.id_index))
        for id in self.document_ids:
            for sim_id, score, none in self.server.find_similar(
                    id, min_score=.2, max_results=10000):
                if sim_id != id:
                    s[self.id_index[id]][self.id_index[sim_id]] = score
        return s

    @property
    def distance_matrix(self):
        try:
            return self._distance_matrix
        except AttributeError:
            s = self.similarity_matrix()
            logging.info('converting similarity matrix to distance matrix')
            self._distance_matrix = 2 * (1 - s)
            return self._distance_matrix

    def dbscan_clusters(self, eps=.4, min_samples=5):
        D = self.distance_matrix
        logging.info('starting dbscan')
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
        db = dbscan.fit(D)
        labels = db.labels_
        clusters = [l for l in set(labels) if l > 0]  # outliers are -1
        logging.info('found %i clusters' % len(clusters))
        for c in clusters:
            cluster = Cluster(
                parameters=dict(algorithm='DBSCAN', eps=eps,
                                min_samples=min_samples))
            cluster.save()
            doc_ids = [self.index_id[i[0]] for i in argwhere(labels == c)]
            logging.info(
                'cluster %s: %s documents' % (cluster.id, len(doc_ids)))
            cluster.documents.add(*doc_ids)
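
A quick numeric check of the 2 * (1 - s) conversion used by distance_matrix above: a similarity of 1.0 maps to a distance of 0.0 and a similarity of 0.0 maps to 2.0 (the matrix below is illustrative):

import numpy as np

s = np.array([[1.0, 0.8],
              [0.8, 1.0]])  # pairwise similarities; self-similarity on the diagonal
d = 2 * (1 - s)             # the precomputed metric handed to DBSCAN
print d                     # [[ 0.   0.4] [ 0.4  0. ]]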
Example #27
    def __init__(self):
        self.service = SessionServer('SearchServer/')
    def resume_scoring(self):
        """"
            Cleanes the data and runs the resume matching code. User is
            requested to pass the job description name, session_name and
            final output file name. Final output is an excel file.
 
            @param: job_description - string
            @param: session_name - string
            @param: output_filename - string
 
            Once you run this code it will prompt you to select the path of the
            directory           
        """
 
 
        self.job_description = self.select_job_description()
        if len(self.job_description) > 0:
 
            #self.job_description_path = os.path.join( self.job_description_path + "/" + job_description)
 
            self.raw_resumes_path =self.select_resume_path()
            if len(self.raw_resumes_path) > 0:               
                self.save_text_files_path = self.select_rawtext_path()
 
                self.raw_resumes_to_text()
                self.jd_to_text()
 
                self.file_list_text = glob.glob(self.save_text_files_path + "/*.*")
                print self.file_list_text
 
                self.resume_id = []
                for i in range(0, len(self.file_list_text)):
                    self.resume_id.append([int(s) for s in self.file_list_text[i].split() if s.isdigit()])
 
                self.documents = []
                for filename in self.file_list_text:
                    with open(filename, 'r') as f:
                        #d = f.read()
                        #print d
                        self.documents.append(f.read())
 
                self.corpus = [{'id': 'doc_%s' % num, 'tokens': utils.simple_preprocess(text)}
                  for num, text in enumerate(self.documents)]
 
                self.count = 0
                while self.count < len(self.resume_id):   
                    for item in self.corpus:
                        if self.resume_id[self.count] == []:
                            item['id'] = 'doc_jd'
                        else:
                            item['id'] = str(self.resume_id[self.count])
                        self.count =  self.count + 1
 
                self.regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
                self.tokenized_corpus_no_punctuation = []
 
                for review in self.corpus:
                    self.new_corpus = []
                    for token in review['tokens']:  # iterate the tokens, not the dict keys
                        self.new_token = self.regex.sub(u'', token)
                        if not self.new_token == u'':
                            self.new_corpus.append(self.new_token)
                    self.tokenized_corpus_no_punctuation.append(self.new_corpus)
 
                self.dir_name = self.setting_up_server_session_dir()       
                self.server = SessionServer(self.dir_name)
 
                logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
                self.server.train(self.corpus, method='lsi')
                self.server.index(self.corpus)
                self.lst = self.server.find_similar('doc_jd')
                self.series = pd.DataFrame(self.lst)
                self.series.columns = ['Resume_ID', 'Score', 'none']
                self.series.index.names = ['Rank']
 
                self.series = self.series.drop(self.series.columns[2], axis = 1)       
                self.final_excel_path()
Example #29
#coding=utf-8

from simserver import SessionServer
server = SessionServer('/tmp/my_server')  # resume server (or create a new one)
Example #30
import os
from gensim import utils
from simserver import SessionServer

server = SessionServer('myserver')
w = open('data/1/549518.txt').read()

docin = {'id': '549518', 'tokens' : utils.simple_preprocess(w)}

print server.find_similar(docin)
Example #31
def test_Gensim(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)

    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)

    server = SessionServer(gss)

    u""" texts = [u"Human machine interface for lab abc computer applications",
             u"A survey of user opinion of computer system response time",
             u"The EPS user interface management system",
             u"System and human system engineering testing of EPS",
             u"Relation of user perceived response time to error measurement",
             u"The generation of random binary unordered trees",
             u"The intersection graph of paths in trees",
             u"Graph minors IV Widths of trees and well quasi ordering",
             u"Graph minors A survey",
             u"Why use a computer"]
    """

    logger.info(u"%s" % server.status())

    corpus = [{u"id": u"doc_%i" % num, u"tokens": utils.simple_preprocess(text)} for num, text in enumerate(texts)]

    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)

    server.train(corpus, method=u"lsi")

    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Option one
    for n in range(0, len(texts)):
        doc = u"doc_%d" % n
        logger.info(u"Find similar doc_%d to %s" % (n, corpus[n][u"tokens"]))
        for sim in server.find_similar(doc):
            m = int(sim[0][-1:])
            if m != n:
                logger.info(u"\t%s \t %3.2f : %s" % (sim[0], float(sim[1]), corpus[m][u"tokens"]))

                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]

                s1 = set(e)
                s2 = set(d)
                common = s1 & s2
                lc = [x for x in common]
                logger.info(u"\tCommon Topics : %s\n" % (lc))

    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(u"Graph and minors and humans and trees.")}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
Example #32
    def __init__(self):
        self.server = SessionServer(r'c:\temp\data_server')
        print self.server
Example #33
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 10 14:34:49 2018

@author: afcarl
"""

from gensim import utils
from simserver import SessionServer

import gensim

#server = SessionServer('/tmp/my_server') # resume server (or create a new one)
server = SessionServer('./my_server') # resume server (or create a new one)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

logger = logging.getLogger('gensim.similarities.simserver')

document = {'id': 'some_unique_string',
            'tokens': ['content', 'of', 'the', 'document', '...'],
            'other_fields_are_allowed_but_ignored': None}

from gensim import utils
texts = ["Human machine interface for lab abc computer applications",
         "A survey of user opinion of computer system response time",
         "The EPS user interface management system",
         "System and human system engineering testing of EPS",
         "Relation of user perceived response time to error measurement",
         "The generation of random binary unordered trees",
Example #34
import json
from bson import json_util
from bson.objectid import ObjectId
from flask import Flask, request
from mongokit import Document
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})
corpus = [{
    'id': idea['_id'],
    'tokens': utils.simple_preprocess(idea['text'])
} for idea in cursor]
utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)

Example #35
    def __init__(self):
        self.server = SessionServer(r'c:\temp\data_server')
        print self.server
Example #36
def GensimClient(texts):
    gsDir = os.getcwd()
    logger.debug(u"GSDir %s" % gsDir)

    gss = gsDir + os.sep + u"gensim_server" + os.sep
    logger.debug(u"%s" % gss)

    server = SessionServer(gss)

    logger.info(u"%s" % server.status())

    corpus = [{u"id": u"url_%i" % n, u"tokens": utils.simple_preprocess(text)} for n, text in enumerate(texts)]

    # send 1k docs at a time
    utils.upload_chunked(server, corpus, chunksize=1000)

    server.train(corpus, method=u"lsi")

    # index the same documents that we trained on...
    server.index(corpus)

    # supply a list of document ids to be removed from the index
    # server.delete(["doc_5", "doc_8"])

    # overall index size unchanged (just 3 docs overwritten)
    server.index(corpus[:3])

    # Option one
    for n in range(0, len(corpus)):
        doc = u"doc_%d" % n
        logger.info(u"------------------------------------------------------")
        logger.info(u"Find similar N doc_%d to %s" % (n, corpus[n][u"tokens"]))
        logger.info(u"------------------------------------------------------")
        for sim in server.find_similar(doc):
            m = int(sim[0][-1:])
            if m != n:
                logger.info(u"\t%s \t %3.2f : M %s" % (sim[0], float(sim[1]), corpus[m][u"tokens"]))

                d = [unicode(x) for x in corpus[n][u"tokens"]]
                e = [unicode(y) for y in corpus[m][u"tokens"]]

                s1 = set(e)
                s2 = set(d)
                common = s1 & s2
                lc = [x for x in common]
                logger.info(u"\t\tCommon Topics : %s" % (lc))

    if False:
        # Option two
        doc = {u"tokens": utils.simple_preprocess(str("Graph and minors and humans and trees."))}
        logger.info(u"%s" % server.find_similar(doc, min_score=0.4, max_results=50))
Example #37
    def __init__(self):
        self.service = SessionServer('SearchServer/')
        self.search_results = []
Example #38
    def __init__(self):
        self.service = SessionServer('SearchServer/')
Example #39
    def __init__(self):
        self.service = SessionServer('SearchServer/')
        self.search_results = []
Example #40
        doc['id'] = 'html_%d' % obj.id
        doc['tokens'] = list(Tokenize(obj.content))
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc

def iter_corpus():
    for obj in SogouCorpus.objects.all():
        doc = {}
        doc['id'] = 'sogou_%d' % obj.id
        doc['tokens'] = obj.tokens.split(',')
        if obj.id % 1000 == 0:
            print 'processing', obj.id
        yield doc

server = SessionServer('/tmp/server')
#server = Pyro4.Proxy(Pyro4.locateNS().lookup('gensim.testserver'))
def train_server():
    training_corpus = iter_documents()
    #training_corpus = iter_corpus()
    #server.train(list(training_corpus), method='lsi')
    #print 'train finished'
    server.index(training_corpus)
    print 'index finished'
    server.optimize()
    print 'optimize finished'

def update_keywords():
    for html in HtmlContent.objects.filter(~Q(retry=3)).filter(~Q(content='')):
        html.tags, html.summerize = summarize(html.content)
        html.summerize = html.summerize[0:388]
    def __init__(self):
        self.server = SessionServer("./tmp")
            tokens = preprocessor.tokenize(qtext)
            tokens = map(preprocessor.deNoise, tokens)
            devocalize_tokens = map(preprocessor.removeDiacritics, tokens)
            denoised_tokens = map(preprocessor.deNoise, devocalize_tokens)
            normalized_tokens = map(preprocessor.normalizeAlef,
                                    denoised_tokens)
            normalized_tokens = map(preprocessor.normalizeAggressive,
                                    normalized_tokens)
            lemmatized_tokens = map(preprocessor.lemmatize, normalized_tokens)

            yield LabeledSentence(words=[w for w in tokens], tags=['%s' % qid])


from simserver import SessionServer

service = SessionServer('tmp/')

service.train(corpus, method='lsi')

import sys


class QuestionPairSimilarity(object):
    def __iter__(self):

        qs = LabeledQuestion('input/SemEval2016-Task3-CQA-MD-test.xml')
        for q in qs:

            service.drop_index()
            qid = q.tags[0]
            print qid
Example #43
import os
from gensim import utils
from simserver import SessionServer


def buildCorpus():

    corpus = []
    for d in os.listdir('data'):
        if not d == '0':
            continue
        cnt = os.listdir('data/' + d)
        i = 0
        for f in os.listdir('data/' + d):
            document = open('data/' + d + '/' + f).read()
            pmcid = f.split('.')[0]
            docin = {'id': pmcid, 'tokens': utils.simple_preprocess(document)}
            corpus.append(docin)
    return corpus


corpus = buildCorpus()

server = SessionServer('myserver')

#server.train(corpus,method='lsi')
server.index(corpus)
    'v3', 'v4', 'v5', 'v9', 'w', 'x', 'z'
]

i_tag_num_threshold = 5

#===========================
#===========================
i_1000_flag = 1
#i_1000_flag = 0
#===========================
#===========================

#server = SessionServer('/tmp/my_server') # resume server (or create a new one)
#server = SessionServer('./my_server') # resume server (or create a new one)
#server = SessionServer('./my_server_A') # resume server (or create a new one)
server = SessionServer(folder_A)  # resume server (or create a new one)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

logger = logging.getLogger('gensim.similarities.simserver')


def load_words():
    with open('words_alpha.txt') as word_file:
        valid_words = set(word_file.read().split())

    return valid_words

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import utils
from simserver import SessionServer
service = SessionServer('c:/temp/gensim')  # or wherever


def index_input_texts():
    texts = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
    corpus = [{'id': 'doc_%i' % num, 'tokens': utils.simple_preprocess(text)}
              for num, text in enumerate(texts)]
    # service.index(corpus)
    service.train(corpus, method='lsi')
    service.index(corpus)  # index the same documents that we trained on...

def query_the_index(input):
    doc = {'tokens': utils.simple_preprocess(input)}
Example #46
class SimService(object):

    def __init__(self, path, preprocess, deaccent=True, lowercase=True,
        stemmer=None, stopwords=None):
        self.service = SessionServer(path)
        self.deaccent = deaccent
        self.lowercase = lowercase
        self.preprocess = preprocess
        self.stemmer = stemmer
        self.stopwords = stopwords

    def find_similar(self, data, min_score, max_results):
        if isinstance(data, basestring):
            doc = data.strip()
            if ' ' in doc:
                doc = {'tokens': self.preprocess(data, deacc=self.deaccent,
                    lowercase=self.lowercase, errors='ignore',
                    stemmer=self.stemmer, stopwords=self.stopwords)}
            try:
                return {'status': 'OK', 'response':
                                    self.service.find_similar(doc,
                                    min_score=min_score,
                                    max_results=max_results)}
            except ValueError:
                return {'status': 'NOTFOUND', 'response':[]}
        else:
            result = {}
            for doc in data:
                try:
                    result[doc] = (self.service.find_similar(
                                    doc,
                                    min_score=min_score,
                                    max_results=max_results))
                except ValueError:
                    pass
            if result:
                return {'status': 'OK', 'response': result}
            else:
                return {'status': 'NOTFOUND', 'response':[]}

    def _buffer(self, data):
        i = 0
        for d in data:
            if 'tokens' in d:
                self.service.buffer([{'id': d['id'], 'tokens': d['tokens']}])
            else:
                self.service.buffer([{'id': d['id'],
                    'tokens': list(self.preprocess(d['text'], deacc=self.deaccent,
                    lowercase=self.lowercase, errors='ignore',
                    stemmer=self.stemmer, stopwords=self.stopwords))}])
            i+=1
        return i

    def train(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        i = self._buffer(data)
        self.service.train(method='lsi')
        logger.info('training complete commit changes')
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response':i}

    def index(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        i = self._buffer(data)
        self.service.index()
        logger.info('indexing complete commit changes')
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response':i}

    def optimize(self):
        self.service.set_autosession(False)
        self.service.open_session()
        self.service.optimize()
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': 'index optimized'}

    def delete(self, data):
        self.service.set_autosession(False)
        self.service.open_session()
        self.service.delete(data)
        self.service.commit()
        self.service.set_autosession(True)
        return {'status': 'OK', 'response': 'documents deleted'}

    def status(self):
        return {'status': 'OK', 'response': self.service.status()}

    def indexed_documents(self):
        return {'status': 'OK', 'response': self.service.keys()}

    def is_indexed(self, doc):
        return {'status': 'OK', 'response': doc in self.service.keys()}
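
A hedged driver for the SimService class above: the path and sample documents are made up, and the wrapper below only approximates the preprocess interface the class expects (extra keyword arguments are accepted but not all are used).

from gensim import utils

def preprocess(text, deacc=False, lowercase=True, errors='ignore',
               stemmer=None, stopwords=None):
    # minimal stand-in: simple_preprocess already lowercases, so the
    # lowercase/errors flags are accepted but ignored here
    tokens = utils.simple_preprocess(text, deacc=deacc)
    if stopwords:
        tokens = [t for t in tokens if t not in stopwords]
    if stemmer:
        tokens = [stemmer(t) for t in tokens]
    return tokens

svc = SimService('/tmp/sim_service_demo', preprocess)  # hypothetical path
print svc.train([{'id': 'doc_0', 'text': 'human machine interface for lab abc applications'},
                 {'id': 'doc_1', 'text': 'a survey of user opinion of system response time'}])
print svc.index([{'id': 'doc_2', 'text': 'the generation of random binary unordered trees'}])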
class Simple_resume_similarity_app_tk(Tkinter.Tk):
 
    def __init__(self):
        Tkinter.Tk.__init__(self)
        self.initialize()
 
    def initialize(self):
 
        button = Tkinter.Button(self,text=u"Click Me!", command = self.resume_scoring)
        button.grid(row = 1, column = 1)
 
 
        self.label1 = Tkinter.Label(self, text = "Click Button To Generate Similarity Score")
        self.label1.grid(row =  2, column = 1)
 
        #self.img = Image.open('C:\\temp\\Resume_Similarity\\Resume_GUI\\wellsfargologo2.gif')
        #self.img_path = r"C:/temp/Resume_Similarity/Resume_GUI/wellsfargologo2.gif"
        #self.im = Image.open(self.img_path)
        #self.ph = PIL.ImageTk.PhotoImage(self.im)
 
        #self.label1 = Label(self, image=self.ph)
        #self.label1.image = self.ph
        #self.label1.pack(side = "left")
 
        #logo = PhotoImage("C:/temp/Resume_Similarity/Resume_match_score/logo.jpg")
        #label.config(image = logo)
 
    def resume_scoring(self):
        """"
            Cleanes the data and runs the resume matching code. User is
            requested to pass the job description name, session_name and
            final output file name. Final output is an excel file.
 
            @param: job_description - string
            @param: session_name - string
            @param: output_filename - string
 
            Once you run this code it will prompt you to select the path of the
            directory           
        """
 
 
        self.job_description = self.select_job_description()
        if len(self.job_description) > 0:
 
            #self.job_description_path = os.path.join( self.job_description_path + "/" + job_description)
 
            self.raw_resumes_path =self.select_resume_path()
            if len(self.raw_resumes_path) > 0:               
                self.save_text_files_path = self.select_rawtext_path()
 
                self.raw_resumes_to_text()
                self.jd_to_text()
 
                self.file_list_text = glob.glob(self.save_text_files_path + "/*.*")
                print self.file_list_text
 
                self.resume_id = []
                for i in range(0, len(self.file_list_text)):
                    self.resume_id.append([int(s) for s in self.file_list_text[i].split() if s.isdigit()])
 
                self.documents = []
                for filename in self.file_list_text:
                    with open(filename, 'r') as f:
                        #d = f.read()
                        #print d
                        self.documents.append(f.read())
 
                self.corpus = [{'id': 'doc_%s' % num, 'tokens': utils.simple_preprocess(text)}
                  for num, text in enumerate(self.documents)]
 
                self.count = 0
                while self.count < len(self.resume_id):   
                    for item in self.corpus:
                        if self.resume_id[self.count] == []:
                            item['id'] = 'doc_jd'
                        else:
                            item['id'] = str(self.resume_id[self.count])
                        self.count =  self.count + 1
 
                self.regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
                self.tokenized_corpus_no_punctuation = []
 
                for review in self.corpus:
                    self.new_corpus = []
                    for token in review['tokens']:  # iterate the tokens, not the dict keys
                        self.new_token = self.regex.sub(u'', token)
                        if not self.new_token == u'':
                            self.new_corpus.append(self.new_token)
                    self.tokenized_corpus_no_punctuation.append(self.new_corpus)
 
                self.dir_name = self.setting_up_server_session_dir()       
                self.server = SessionServer(self.dir_name)
 
                logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
                self.server.train(self.corpus, method='lsi')
                self.server.index(self.corpus)
                self.lst = self.server.find_similar('doc_jd')
                self.series = pd.DataFrame(self.lst)
                self.series.columns = ['Resume_ID', 'Score', 'none']
                self.series.index.names = ['Rank']
 
                self.series = self.series.drop(self.series.columns[2], axis = 1)       
                self.final_excel_path()
 
 
    def setting_up_server_session_dir(self):
        self.dir = 'C:/temp/resume_server_script_server_logs'
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        else:
            shutil.rmtree(self.dir)           #removes all the subdirectories!
            os.makedirs(self.dir)
        return self.dir
 
 
    def convert(self,fname, pages=None):
        if not pages:
            pagenums = set()
        else:
            pagenums = set(pages)         
        output = StringIO()       
        manager = PDFResourceManager()       
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)       
        infile = file(fname, 'rb')       
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
        infile.close()
        converter.close()
        text = output.getvalue()
        output.close()
        return text
 
    def select_job_description(self):
        root = Tkinter.Tk()
        root.withdraw() #use to hide tkinter window
 
        currdir = os.getcwd()
        self.tempdir = tkFileDialog.askopenfilename(parent=root,
                                            initialdir=currdir,
                                            title="Select Job Description file")
        if len(self.tempdir) > 0:
            return self.tempdir
 
    def select_resume_path(self):
        root = Tkinter.Tk()
        root.withdraw() #use to hide tkinter window
 
        currdir = os.getcwd()
        tempdir = tkFileDialog.askdirectory(parent=root,
                                            initialdir=currdir,
                                            title="Select Resume Description Path")
        if len(tempdir) > 0:
            return tempdir
 
    def select_rawtext_path(self):
        root = Tkinter.Tk()
        root.withdraw() #use to hide tkinter window
 
        currdir = os.getcwd()
        tempdir = tkFileDialog.askdirectory(parent=root,
                                            initialdir=currdir,
                                            title="Select Path Where You Want To Save Text Files.")
        if len(tempdir) > 0:
            return tempdir
 
    def final_excel_path(self):
        root = Tkinter.Tk()
        root.withdraw() #use to hide tkinter window
 
        currdir = os.getcwd()
        savefile  = tkFileDialog.asksaveasfilename(filetypes=(("Excel files", "*.xlsx"),
                                                         ("All files", "*.*") ),
                                            parent=root,
                                            initialdir=currdir,
                                            title="Final Excel Output Path")
        if len(savefile) > 0:
            self.series.to_excel(savefile + ".xlsx", index=True, sheet_name="Results")        
 
 
 
    def raw_resumes_to_text(self):
        ## Reading the files path
        file_list_raw = glob.glob(self.raw_resumes_path + "/*.*")
        for fp in file_list_raw:
        # print fp
            ext = os.path.splitext(fp)[-1].lower()
            base = os.path.basename(fp)
            file_name = os.path.splitext(base)[0]   
 
            #print ext
            if ext == ".docx":       
                text = textract.process(fp)
                complte_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")        
                with open(complte_name, 'w') as f:
                    f.write(text)
 
            elif ext == ".pdf":       
                text = self.convert(fp)
                complte_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")        
                with open(complte_name, 'w') as f:
                    f.write(text)
            elif ext == ".txt":
                shutil.copy(os.path.join(self.raw_resumes_path + str("/") + file_name + ".txt"), os.path.join(self.save_text_files_path + str("/") + file_name + ".txt"))
            else:       
                print "Unable to recognise this format."
 
    def jd_to_text(self):
 
        ext = os.path.splitext(self.job_description)[-1].lower()
        file_name_with_ext = os.path.basename(self.job_description)
        file_name = os.path.splitext(file_name_with_ext)[0].lower()
 
        if ext == ".docx":
            text = textract.process(self.job_description)
            complte_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")        
            with open(complte_name, 'w') as f:
                f.write(text)
 
        elif ext == ".pdf":       
            text = self.convert(self.job_description)
            complte_name = os.path.join(self.save_text_files_path + "/" + file_name + ".txt")        
            with open(complte_name, 'w') as f:
                f.write(text)
 
        elif ext == ".txt":
            shutil.copy(self.job_description, os.path.join(self.save_text_files_path + str("/") + file_name + ".txt"))
 
        else:
            print "This file format is not supported for now."
Example #48
from flask import Flask
from flask import json
from flask import request
from flask import Response
import os
app = Flask(__name__)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from gensim import utils

from simserver import SessionServer
#BEFORE TRAINING NEW MODEL - CHANGE PATH BELOW
service = SessionServer('/tmp/mirFlickr4500')

# FORMAT FOR DATA POSTED TO /index: {"id":NUMBER,"tokens":["STRING","STRING","STRING"]}


@app.route('/test', methods=['GET'])
def test():
    return "server is running"


@app.route('/index', methods=['POST'])
def indexPhoto():
    print(request.json)
    service.index(request.json)
    return "Recieved: " + json.dumps(request.json)
Example #49
import os
from gensim import utils
from simserver import SessionServer

def buildCorpus():
    
    corpus = []
    for d in os.listdir('data'):
        if not d == '0':
            continue
        cnt = os.listdir('data/'+d)
        i = 0
        for f in os.listdir('data/'+d):
            document = open('data/'+d+'/'+f).read()
            pmcid = f.split('.')[0]
            docin = {'id' : pmcid,
                     'tokens' : utils.simple_preprocess(document)
            }
            corpus.append(docin)
    return corpus

corpus = buildCorpus()

server = SessionServer('myserver')

#server.train(corpus,method='lsi')
server.index(corpus)
Example #50
#an example by Steven Du, showing how to use this server for Chinese documents

# train: let the server learn the LSI model
# index: set up your own pool of documents that you want the query to search
# find_similar: find the similar documents in the indexed pool of documents.
# Input to this server (train, index, find_similar) is a list of {'id': 'doc_%i' % num, 'tokens': text.split()}
# (a minimal round trip for these three calls is sketched after this example)


from simserver import SessionServer
import codecs
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

serverFilePath='./temp_index_dir'

server = SessionServer(serverFilePath) # resume server (or create a new one)


texts=['如果 也 没有 的话 。 这个 确实 没有 办法 了 。 我 个人 建议您 重装 一遍 这个 软件 看看 是否 还是 一样 卸载 程序 里 也 没有 呢',
'我能 直接 删掉 这些 文件 吗 ?',
'不 建议 呢 。 因为 不 确定 这些 文件 中 是否 有 其他软件 的 文件 呢',
'好 的 , 使用 看看 会断 么',
'它 只是 有时 自动 掉 , 以后 看看 怎么样',
'这个 是 您 无线 驱动 : http : / / driverdl . lenovo . com . cn / lenovo / driverfilesuploadfloder / 32228 / wlan _ win8 . 1 . exe',
'要是 问题 还是 出现 您 可以 安装 这个 试试',
'10 几个 版本 都 试过 了 么',
'目前 可以 确认 08 版本 以上 正常 运行',
'这个 是 电源 吧',
'http : / / weixin . lenovo . com . cn / img / files / user _ files / olhctjgaid22zzdnezguwbxzuxrq / voice / 16 _ 03 _ 17 / 1104209 _ 729724 _ 1458213046 . jpg',
'现在 不是 运行 问题 , 是 安装 问题',
'点 电源 卸载 没 反应 呢',
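
A minimal round trip for the train/index/find_similar calls described in the comments at the top of this example, using toy whitespace-tokenized documents and a made-up directory:

from simserver import SessionServer

demo_server = SessionServer('./demo_index_dir')  # hypothetical directory
demo_texts = ['human machine interface',
              'survey of user opinion',
              'graph minors survey']
demo_corpus = [{'id': 'doc_%i' % num, 'tokens': text.split()}
               for num, text in enumerate(demo_texts)]
demo_server.train(demo_corpus, method='lsi')  # learn the LSI model
demo_server.index(demo_corpus)                # make the documents searchable
print demo_server.find_similar('doc_0')       # (id, similarity, payload) triples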
Example #51
import os
from gensim import utils
from simserver import SessionServer

server = SessionServer("myserver")
w = open("data/1/549518.txt").read()

docin = {"id": "549518", "tokens": utils.simple_preprocess(w)}

print server.find_similar(docin)
Example #52
import json
from bson import json_util
from bson.objectid import ObjectId
from flask import Flask, request
from mongokit import Document
from flask.ext.pymongo import PyMongo
import datetime
from simserver import SessionServer
from gensim import utils
import itertools
from pymongo import MongoClient

sim_server = SessionServer('./tmp/idea_match_server')
client = MongoClient('localhost', 3001)
db = client.meteor
cursor = db.ideas.find({})
corpus = [{'id': idea['_id'], 'tokens': utils.simple_preprocess(idea['text'])} for idea in cursor]
utils.upload_chunked(sim_server, corpus, chunksize=1000)
sim_server.train(corpus, method='lsi')
sim_server.index(corpus)

app = Flask(__name__)
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_PORT'] = 3001
app.config['MONGO_DBNAME'] = 'meteor'
mongo = PyMongo(app)


class Idea(Document):
    structure = {
        'text':unicode,
    def __init__(self):
        self.server = SessionServer(settings.SIMSERVER_WORKING_DIR)
        if not self.server.stable.model:
            self.server.train(self.corpus)
        if not self.server.stable.fresh_index:
            self.server.index(self.corpus)