def loadWordVectors(wv_fname, redir_fname):
    """Create the module-level ``wordvectors`` object from the given files.

    A ``WordVectors`` instance is built with ``negvectors=False`` and
    ``sentence_length=200``, stored in the global name ``wordvectors``
    (replacing whatever was bound there), and then has its
    ``add_unknown_words`` attribute set to ``False``.

    Args:
        wv_fname: Value forwarded as the ``fname`` keyword of ``WordVectors``.
        redir_fname: Value forwarded as the ``redir_fname`` keyword of
            ``WordVectors``.

    Returns:
        None. The result is published through the ``wordvectors`` global.
    """
    global wordvectors

    constructor_kwargs = {
        # e.g. "/data/matthew/enwiki-20141208-pages-articles-multistream-links7-output1.bin"
        "fname": wv_fname,
        # e.g. '/data/matthew/enwiki-20141208-pages-articles-multistream-redirect7.json'
        "redir_fname": redir_fname,
        "negvectors": False,
        "sentence_length": 200,
    }

    # Bind the global first, then flip the flag on the published object.
    wordvectors = WordVectors(**constructor_kwargs)
    wordvectors.add_unknown_words = False
def loadWordVectors(wv_fname, redir_fname):
    """Create the module-level ``wordvectors`` object from the given files.

    A ``WordVectors`` instance is built with ``negvectors=False`` and
    ``sentence_length=200``, stored in the global name ``wordvectors``
    (replacing whatever was bound there), and then has its
    ``add_unknown_words`` attribute set to ``False``.

    Args:
        wv_fname: Value forwarded as the ``fname`` keyword of ``WordVectors``.
        redir_fname: Value forwarded as the ``redir_fname`` keyword of
            ``WordVectors``.

    Returns:
        None. The result is published through the ``wordvectors`` global.
    """
    global wordvectors

    constructor_kwargs = {
        # e.g. "/data/matthew/enwiki-20141208-pages-articles-multistream-links7-output1.bin"
        "fname": wv_fname,
        # e.g. '/data/matthew/enwiki-20141208-pages-articles-multistream-redirect7.json'
        "redir_fname": redir_fname,
        "negvectors": False,
        "sentence_length": 200,
    }

    # Bind the global first, then flip the flag on the published object.
    wordvectors = WordVectors(**constructor_kwargs)
    wordvectors.add_unknown_words = False
import re

# Theano configuration, applied as plain attribute assignments on theano.config.
# floatX selects the float type Theano uses by default.
theano.config.floatX = 'float32'
#theano.config.linker = 'cvm_nogc'
# Enable OpenMP; per the Theano config docs, elementwise ops are only
# parallelised for inputs of at least openmp_elemwise_minsize elements.
theano.config.openmp = True
theano.config.openmp_elemwise_minsize = 20000

# In[3]:

# Load the query set; only the 'queries' entry of the JSON document is kept.
with open('/data/matthew/external-wiki2.json') as f:
    queries = json.load(f)['queries']

# Module-level WordVectors instance built from the hard-coded "links5" vector
# file and "redirects5" redirect file. This runs whenever the module executes.
wordvectors = WordVectors(
    fname= "/data/matthew/enwiki-20141208-pages-articles-multistream-links5-output1.bin",
    redir_fname= '/data/matthew/enwiki-20141208-pages-articles-multistream-redirects5.json',
    negvectors=False,
    sentence_length=200,
)
# Presumably stops the instance from registering words it has not seen --
# TODO confirm against the WordVectors implementation.
wordvectors.add_unknown_words = False

# Alias for the redirects attribute exposed by the WordVectors instance.
page_redirects = wordvectors.redirects

from wikireader import WikiRegexes, WikipediaReader

# NOTE(review): the default values below are evaluated once, when this def
# statement executes, so they capture the objects bound to wordvectors,
# queries and page_redirects at that moment. Rebinding those globals later
# does not change the defaults.
def PreProcessedQueries(wikipedia_dump_fname, wordvec=wordvectors, queries=queries, redirects=page_redirects):
import re

# Theano configuration, applied as plain attribute assignments on theano.config.
# floatX selects the float type Theano uses by default.
theano.config.floatX = 'float32'
#theano.config.linker = 'cvm_nogc'
# Enable OpenMP; per the Theano config docs, elementwise ops are only
# parallelised for inputs of at least openmp_elemwise_minsize elements.
theano.config.openmp = True
theano.config.openmp_elemwise_minsize = 20000

# In[3]:

# Load the query set; only the 'queries' entry of the JSON document is kept.
with open('/data/matthew/external-wiki2.json') as f:
    queries = json.load(f)['queries']

# Module-level WordVectors instance built from the hard-coded "links7" vector
# file and "redirect7" redirect file. This runs whenever the module executes.
wordvectors = WordVectors(
    fname="/data/matthew/enwiki-20141208-pages-articles-multistream-links7-output1.bin",
    redir_fname='/data/matthew/enwiki-20141208-pages-articles-multistream-redirect7.json',
    negvectors=False,
    sentence_length=200,
)
# Presumably stops the instance from registering words it has not seen --
# TODO confirm against the WordVectors implementation.
wordvectors.add_unknown_words = False

# Alias for the redirects attribute exposed by the WordVectors instance.
page_redirects = wordvectors.redirects

from wikireader import WikiRegexes, WikipediaReader

# NOTE(review): the default values below are evaluated once, when this def
# statement executes, so they capture the objects bound to wordvectors,
# queries and page_redirects at that moment. Rebinding those globals later
# does not change the defaults.
def PreProcessedQueries(wikipedia_dump_fname, wordvec=wordvectors, queries=queries, redirects=page_redirects):
    # Matches any single character that is not an ASCII letter, digit or space.
    get_words = re.compile('[^a-zA-Z0-9 ]')
    # Group 1 lazily captures the text between a '[' and the next ']'.
    # NOTE(review): this is a non-raw string containing '\[' and '\]'; recent
    # Python versions warn about such escapes, so a raw string may be wanted.
    get_link = re.compile('.*?\[(.*?)\].*?')