Example No. 1
import re
import sys
import json
import boto.utils
import time
import traceback
from random import random
from multiprocessing import Pool
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from boto.exception import S3ResponseError
from nlp_client.caching import useCaching
from nlp_client.services import *

useCaching(writeOnly=True)

workers = int(sys.argv[1])
services_file = sys.argv[2] if len(sys.argv) > 2 else 'services-config.json'

services = json.loads(open(services_file).read())['services']
BUCKET = S3Connection().get_bucket('nlp-data')

def call_services(keyname):
    global BUCKET

    key = BUCKET.get_key(keyname)
    if key is None:
        return

    # Unique signature for this worker: EC2 hostname, timestamp, random salt
    SIG = "%s_%s_%s" % (boto.utils.get_instance_metadata()['local-hostname'], str(time.time()), str(int(random()*100)))
Example No. 2

from nlp_client.services import WpTopEntitiesService
from nlp_client.caching import useCaching
import sys

useCaching()

print WpTopEntitiesService().nestedGet(sys.argv[1])
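
Example No. 2 is a one-line CLI wrapper around the service; a hedged invocation sketch (the script name is an assumption):

# python top_entities.py <wiki_id>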
Example No. 3

import logging
import requests
import sys
import traceback
from collections import defaultdict
from identify_wiki_subjects import identify_subject
from multiprocessing import Pool
from nlp_client.caching import useCaching
from nlp_client.services import TopEntitiesService
from wiki_recommender import as_euclidean, get_topics_sorted_keys

# Specify how many of the top wikis to iterate over
top_n = int(sys.argv[1])

SOLR_URL = 'http://dev-search.prod.wikia.net:8983/solr/xwiki/select'

useCaching(dontCompute=True)

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
fh = logging.FileHandler('name_lda_topics.log')
fh.setLevel(logging.ERROR)
log.addHandler(fh)
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
log.addHandler(sh)

# Jaccard functions taken from https://github.com/mouradmourafiq/data-analysis
def jaccard_sim(tup_1, tup_2, verbose=False):
    """Calculate the Jaccard similarity of 2 tuples"""
    # Truncated in the source; completed here with the standard formula.
    set_1 = set(tup_1)
    set_2 = set(tup_2)
    intersection = len(set_1 & set_2)
    union = len(set_1 | set_2)
    return intersection / float(union) if union else 0.0
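
A quick sanity check of the completed helper, using throwaway tuples:

print jaccard_sim((1, 2, 3), (2, 3, 4))  # 2 shared of 4 distinct elements -> 0.5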
Example No. 4
# Imports implied by the snippet (cf. Example No. 10 below):
from nlp_client import caching
from optparse import OptionParser

parser = OptionParser()
parser.add_option('-s',
                  '--service',
                  dest='service',
                  default=None,
                  help="The Service.method you want to purge")
parser.add_option('-d',
                  '--doc_id',
                  dest='doc_id',
                  default=None,
                  help="The doc id you want to purge responses for")
parser.add_option('-w',
                  '--wiki_id',
                  dest='wiki_id',
                  default=None,
                  help="The wiki id you want to purge responses for")

(options, args) = parser.parse_args()

if not options.service and not options.doc_id and not options.wiki_id:
    raise ValueError("Need to specify a type of purge")

caching.useCaching()

if options.service:
    caching.purgeCacheForService(options.service)
elif options.doc_id:
    caching.purgeCacheForDoc(options.doc_id)
elif options.wiki_id:
    caching.purgeCacheForWiki(options.wiki_id)
Example No. 5
import sys
import time
from nlp_client.services import RedirectsService, AllTitlesService
from nlp_client.caching import useCaching

start = time.time()
useCaching(writeOnly=True)
titles = AllTitlesService().nestedGet(sys.argv[1])
redirects = RedirectsService().nestedGet(sys.argv[1])
print "Finished %s in %d seconds (%d titles, %d redirects)" % (
    sys.argv[1], int(time.time() - start), len(titles), len(redirects))
Example No. 6

from nlp_client.caching import useCaching
from nlp_client.services import HeadsCountService, TopEntitiesService

def getData(wid):
    useCaching(perServiceCaching={'TopEntitiesService.get': {'dont_compute': True}, 'HeadsCountService.get': {'dont_compute': True}})
    return [(wid, [HeadsCountService().nestedGet(wid), TopEntitiesService().nestedGet(wid)])]
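
Since Example No. 6 is just a bare mapper, here is a hedged sketch of how it would typically be fanned out across processes; the pool size and the wids list are assumptions, not part of the source snippet:

from multiprocessing import Pool

wids = ['831', '3125']  # hypothetical wiki ids, for illustration only
wid_entities = dict(sum(Pool(processes=4).map(getData, wids), []))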
Example No. 7

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from math import sqrt
import gensim
from sklearn.svm import SVC
import os
from nlp_client.services import WikiPageEntitiesService, WikiEntitiesService, WpWikiPageEntitiesService, TopEntitiesService, HeadsCountService
from nlp_client.caching import useCaching
import sys
import requests

topN = int(sys.argv[1])

num_topics = int(sys.argv[2])

useCaching(perServiceCaching={'TopEntitiesService.get': {'dont_compute': True}, 'HeadsCountService.get': {'dont_compute': True}})

wids = [str(int(line)) for line in open('topwams.txt').readlines()][:topN]


def vec2dense(vec, num_terms):
    '''Convert from sparse gensim format to dense list of numbers'''
    return list(gensim.matutils.corpus2dense([vec], num_terms=num_terms).T[0])
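
# Quick illustration (not part of the original script): vec2dense expands a
# sparse gensim vector into a fixed-width dense list, e.g.
# vec2dense([(0, 1.0), (3, 2.0)], num_terms=5) -> [1.0, 0.0, 0.0, 2.0, 0.0]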

entities = dict([(wid, [HeadsCountService().nestedGet(wid), TopEntitiesService().nestedGet(wid)]) for wid in wids])

widToEntityList = {}
for wid in entities:
    widToEntityList[wid] = []
Example No. 8
from nlp_client.caching import useCaching
from nlp_client.services import HeadsCountService, TopEntitiesService

def getData(wid):
    useCaching(perServiceCaching={'TopEntitiesService.get': {'dont_compute': True}, 'HeadsCountService.get': {'dont_compute': True}})
    return [(wid, [HeadsCountService().nestedGet(wid), TopEntitiesService().nestedGet(wid)])]
Example No. 9
# Flask, flask-restful, and the service classes are used below but were
# not imported in this snippet; these imports are implied:
from flask import Flask
from flask.ext import restful
from nlp_client.caching import useCaching
from nlp_client.services import *
import json
import sys

app = Flask(__name__)
api = restful.Api(app)

api.add_resource(ParsedXmlService, '/doc/<string:doc_id>/xml')
api.add_resource(ParsedJsonService, '/doc/<string:doc_id>/json')
api.add_resource(AllNounPhrasesService, '/doc/<string:doc_id>/nps')
api.add_resource(AllVerbPhrasesService, '/doc/<string:doc_id>/vps')
api.add_resource(HeadsService, '/doc/<string:doc_id>/heads')
api.add_resource(CoreferenceCountsService, '/doc/<string:doc_id>/corefs')
api.add_resource(SolrPageService, '/doc/<string:doc_id>/solr')
api.add_resource(SentimentService, '/doc/<string:doc_id>/sentiment')
api.add_resource(EntitiesService, '/doc/<string:doc_id>/entities')
api.add_resource(EntityCountsService, '/doc/<string:doc_id>/entity_counts')
api.add_resource(SolrWikiService, '/wiki/<string:wiki_id>/solr')
api.add_resource(WikiEntitiesService, '/wiki/<string:wiki_id>/entities')
api.add_resource(
    ListDocIdsService,
    '/wiki/<string:wiki_id>/docs/')  #todo: get start & offset working
api.add_resource(TopEntitiesService, '/wiki/<string:wiki_id>/top_entities')
api.add_resource(HeadsCountService, '/wiki/<string:wiki_id>/head_counts')
api.add_resource(TopHeadsService, '/wiki/<string:wiki_id>/top_heads')

if __name__ == '__main__':
    if len(sys.argv) > 1:
        useCaching()
    app.run(debug=True, host='0.0.0.0')
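
With the server running, each resource above maps to a plain GET endpoint. A hedged sketch of exercising one of them (host and port follow app.run above; the doc id is a hypothetical placeholder):

import requests

print requests.get('http://localhost:5000/doc/831_50/nps').json()  # '831_50' is a made-up doc id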
Example No. 10
'''
Allows different dimensionalities of cache purging.
'''
from nlp_client import caching
from optparse import OptionParser

parser = OptionParser()
parser.add_option('-s', '--service', dest='service', default=None,
                  help="The Service.method you want to purge")
parser.add_option('-d', '--doc_id', dest='doc_id', default=None,
                  help="The doc id you want to purge responses for")
parser.add_option('-w', '--wiki_id', dest='wiki_id', default=None,
                  help="The wiki id you want to purge responses for")

(options, args) = parser.parse_args()


if not options.service and not options.doc_id and not options.wiki_id:
    raise ValueError("Need to specify a type of purge")

caching.useCaching()

if options.service:
    caching.purgeCacheForService(options.service)
elif options.doc_id:
    caching.purgeCacheForDoc(options.doc_id)
elif options.wiki_id:
    caching.purgeCacheForWiki(options.wiki_id)
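
For reference, typical invocations of this purge script (the filename is an assumption):

# python purge_cache.py --service TopEntitiesService.get
# python purge_cache.py --doc_id <doc_id>
# python purge_cache.py --wiki_id <wiki_id>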