# Worker-pool front half: reads a worker count and a services config from the
# CLI, enables write-only response caching, and binds the 'nlp-data' S3 bucket
# used by call_services() below.
import re
import sys
import json
import boto.utils
import time
import traceback
from random import random
from multiprocessing import Pool
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from boto.exception import S3ResponseError
from nlp_client.caching import useCaching
from nlp_client.services import *

# Write-only: compute fresh responses but persist them to the cache.
useCaching(writeOnly=True)

# CLI: argv[1] = number of pool workers; argv[2] = optional path to the
# services config JSON (defaults to 'services-config.json').
workers = int(sys.argv[1])
services = sys.argv[2] if len(sys.argv) > 2 else 'services-config.json'
services = json.loads(open(services).read())['services']

# Shared S3 bucket handle; reused across calls via the `global` below.
BUCKET = S3Connection().get_bucket('nlp-data')

def call_services(keyname):
    # Process a single S3 key; silently skips keys that no longer exist.
    global BUCKET
    key = BUCKET.get_key(keyname)
    if key is None:
        return
    # Claim signature: host + timestamp + random salt, presumably used to
    # lock the key against other workers — TODO confirm, the rest of this
    # function is outside the visible chunk.
    SIG = "%s_%s_%s" % (boto.utils.get_instance_metadata()['local-hostname'], str(time.time()), str(int(random()*100)))
    # NOTE(review): function body appears truncated at this point in the chunk.
# CLI: print the WpTopEntities response for the wiki id given as argv[1],
# going through the shared response cache.
import sys

from nlp_client.caching import useCaching
from nlp_client.services import WpTopEntitiesService

useCaching()
wiki_id = sys.argv[1]
print(WpTopEntitiesService().nestedGet(wiki_id))
# Head of the wiki-recommender topic-naming script: configure read-only
# caching, set up dual logging (errors to file, info to stderr), and define
# Jaccard helpers.
import logging
import sys
import traceback
from collections import defaultdict
from multiprocessing import Pool

import requests

from identify_wiki_subjects import identify_subject
from nlp_client.caching import useCaching
from nlp_client.services import TopEntitiesService
from wiki_recommender import as_euclidean, get_topics_sorted_keys

# Specify how many of the top wikis to iterate over
top_n = int(sys.argv[1])

SOLR_URL = 'http://dev-search.prod.wikia.net:8983/solr/xwiki/select'

# dontCompute: serve only already-cached responses, never recompute.
useCaching(dontCompute=True)

# FIX: `logging` was used below but never imported, so this script raised a
# NameError on startup; the stdlib import is added above.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
fh = logging.FileHandler('name_lda_topics.log')
fh.setLevel(logging.ERROR)
log.addHandler(fh)
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
log.addHandler(sh)

# Jaccard functions taken from https://github.com/mouradmourafiq/data-analysis
def jaccard_sim(tup_1, tup_2, verbose=False):
    """Calculate the Jaccard similiarity of 2 tuples"""
    # NOTE(review): `sum` shadows the builtin here; left untouched because the
    # remainder of this function lies outside the visible chunk.
    sum = len(tup_1) + len(tup_2)
    set_1 = set(tup_1)
    # NOTE(review): body appears truncated at this point in the chunk.
# Cache-purge CLI body: exactly one of --service / --doc_id / --wiki_id
# selects the purge dimension. (OptionParser and `caching` are imported
# earlier in the file.)
parser = OptionParser()
parser.add_option('-s', '--service', dest='service', default=None,
                  help="The Service.method you want to purge")
parser.add_option('-d', '--doc_id', dest='doc_id', default=None,
                  help="The doc id you want to purge responses for")
parser.add_option('-w', '--wiki_id', dest='wiki_id', default=None,
                  help="The wiki id you want to purge responses for")
(options, args) = parser.parse_args()

# At least one purge dimension is required.
if not (options.service or options.doc_id or options.wiki_id):
    raise ValueError("Need to specify a type of purge")

caching.useCaching()

# First populated option wins, mirroring the original if/elif precedence:
# service, then doc, then wiki.
for selected, purge in ((options.service, caching.purgeCacheForService),
                        (options.doc_id, caching.purgeCacheForDoc),
                        (options.wiki_id, caching.purgeCacheForWiki)):
    if selected:
        purge(selected)
        break
# CLI: fetch all titles and redirects for the wiki id in argv[1] through the
# write-only cache, and report how long it took.
import sys
import time

from nlp_client.caching import useCaching
from nlp_client.services import RedirectsService, AllTitlesService

start = time.time()
useCaching(writeOnly=True)

wiki_id = sys.argv[1]
titles = AllTitlesService().nestedGet(wiki_id)
redirects = RedirectsService().nestedGet(wiki_id)

elapsed = int(time.time() - start)
print("Finished %s in %d seconds (%d titles, %d redirects)" % (
    wiki_id, elapsed, len(titles), len(redirects)))
def getData(wid):
    """Fetch head counts and top entities for one wiki id.

    Configures per-service read-only caching (never recompute), then returns
    a single-item list pairing the wiki id with [head counts, top entities].
    """
    read_only = {'dont_compute': True}
    useCaching(perServiceCaching={'TopEntitiesService.get': read_only,
                                  'HeadsCountService.get': read_only})
    head_counts = HeadsCountService().nestedGet(wid)
    top_entities = TopEntitiesService().nestedGet(wid)
    return [(wid, [head_counts, top_entities])]
# Head of an LDA feature-prep script: pull per-wiki head counts and top
# entities (read-only cache) for the top-N wikis by WAM.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import os
import sys
from math import sqrt

import gensim
import requests
from sklearn.svm import SVC

from nlp_client.services import WikiPageEntitiesService, WikiEntitiesService, WpWikiPageEntitiesService, TopEntitiesService, HeadsCountService
from nlp_client.caching import useCaching

# CLI: argv[1] = how many top-WAM wikis to use, argv[2] = LDA topic count.
topN = sys.argv[1]
num_topics = int(sys.argv[2])

# Read-only cache for the two expensive wiki-level services.
useCaching(perServiceCaching={'TopEntitiesService.get': {'dont_compute': True},
                              'HeadsCountService.get': {'dont_compute': True}})

wids = [str(int(line)) for line in open('topwams.txt').readlines()][:int(topN)]


def vec2dense(vec, num_terms):
    '''Convert from sparse gensim format to dense list of numbers'''
    return list(gensim.matutils.corpus2dense([vec], num_terms=num_terms).T[0])


# FIX: removed a dead `entities = []` that was immediately clobbered, and
# replaced dict([(k, v) ...]) with a dict comprehension.
entities = {wid: [HeadsCountService().nestedGet(wid), TopEntitiesService().nestedGet(wid)]
            for wid in wids}

widToEntityList = {}
for wid in entities:
    widToEntityList[wid] = []
    # NOTE(review): the loop body may continue beyond this visible chunk;
    # left verbatim.
# REST wiring for the NLP services API. (Flask, restful, and the service
# classes are imported earlier in the file.)
from nlp_client.caching import useCaching
import json
import sys

app = Flask(__name__)
api = restful.Api(app)

# (resource, route) table keeps the endpoint map in one place.
_ROUTES = [
    (ParsedXmlService, '/doc/<string:doc_id>/xml'),
    (ParsedJsonService, '/doc/<string:doc_id>/json'),
    (AllNounPhrasesService, '/doc/<string:doc_id>/nps'),
    (AllVerbPhrasesService, '/doc/<string:doc_id>/vps'),
    (HeadsService, '/doc/<string:doc_id>/heads'),
    (CoreferenceCountsService, '/doc/<string:doc_id>/corefs'),
    (SolrPageService, '/doc/<string:doc_id>/solr'),
    (SentimentService, '/doc/<string:doc_id>/sentiment'),
    (EntitiesService, '/doc/<string:doc_id>/entities'),
    (EntityCountsService, '/doc/<string:doc_id>/entity_counts'),
    (SolrWikiService, '/wiki/<string:wiki_id>/solr'),
    (WikiEntitiesService, '/wiki/<string:wiki_id>/entities'),
    (ListDocIdsService, '/wiki/<string:wiki_id>/docs/'),  # todo: get start & offset working
    (TopEntitiesService, '/wiki/<string:wiki_id>/top_entities'),
    (HeadsCountService, '/wiki/<string:wiki_id>/head_counts'),
    (TopHeadsService, '/wiki/<string:wiki_id>/top_heads'),
]
for resource, route in _ROUTES:
    api.add_resource(resource, route)

if __name__ == '__main__':
    # Any extra CLI argument turns on response caching.
    if len(sys.argv) > 1:
        useCaching()
    app.run(debug=True, host='0.0.0.0')
'''
Purge cached NLP responses along one of three dimensionalities:
a whole Service.method, a single document, or a whole wiki.
'''
from optparse import OptionParser

from nlp_client import caching

parser = OptionParser()
parser.add_option('-s', '--service', dest='service', default=None,
                  help="The Service.method you want to purge")
parser.add_option('-d', '--doc_id', dest='doc_id', default=None,
                  help="The doc id you want to purge responses for")
parser.add_option('-w', '--wiki_id', dest='wiki_id', default=None,
                  help="The wiki id you want to purge responses for")
(options, args) = parser.parse_args()

# Require at least one purge dimension (service wins over doc, doc over wiki).
if not (options.service or options.doc_id or options.wiki_id):
    raise ValueError("Need to specify a type of purge")

caching.useCaching()

if options.service:
    caching.purgeCacheForService(options.service)
elif options.doc_id:
    caching.purgeCacheForDoc(options.doc_id)
elif options.wiki_id:
    caching.purgeCacheForWiki(options.wiki_id)