Example #1
0
def loadWordVectors(wv_fname, redir_fname):
    """Create the module-level ``wordvectors`` object from the given files.

    Args:
        wv_fname: Passed to ``WordVectors`` as ``fname``, e.g.
            "/data/matthew/enwiki-20141208-pages-articles-multistream-links7-output1.bin".
        redir_fname: Passed to ``WordVectors`` as ``redir_fname``, e.g.
            '/data/matthew/enwiki-20141208-pages-articles-multistream-redirect7.json'.

    Returns:
        None. The new instance is stored in the global ``wordvectors``,
        replacing whatever was bound there before.
    """
    global wordvectors
    # Gather the constructor arguments first; the two paths come from the
    # caller, the remaining settings are fixed.
    settings = {
        'fname': wv_fname,
        'redir_fname': redir_fname,
        'negvectors': False,
        'sentence_length': 200,
    }
    wordvectors = WordVectors(**settings)
    # The flag is switched off on the freshly built instance, after the
    # global has been rebound.
    wordvectors.add_unknown_words = False
Example #2
0
def loadWordVectors(wv_fname, redir_fname):
    """Rebind the global ``wordvectors`` to a new ``WordVectors`` instance.

    Args:
        wv_fname: Value for the ``fname`` keyword of ``WordVectors``, e.g.
            "/data/matthew/enwiki-20141208-pages-articles-multistream-links7-output1.bin".
        redir_fname: Value for the ``redir_fname`` keyword of
            ``WordVectors``, e.g.
            '/data/matthew/enwiki-20141208-pages-articles-multistream-redirect7.json'.

    Returns:
        None. The result is published through the module-level name
        ``wordvectors``.
    """
    global wordvectors
    # Settings that never vary between calls are kept apart from the
    # caller-supplied file names.
    fixed_settings = dict(negvectors=False, sentence_length=200)
    wordvectors = WordVectors(
        fname=wv_fname,
        redir_fname=redir_fname,
        **fixed_settings
    )
    # Turn the flag off once the instance exists and the global is set.
    wordvectors.add_unknown_words = False
import re

# Module-level setup: configure Theano, load the query set, and build the
# shared word-vector object. All of this runs at import time, and the names
# bound here (queries, wordvectors, page_redirects) are used as default
# arguments of PreProcessedQueries below, so they must exist before that
# definition is executed.

# Use 32-bit floats as Theano's floatX type.
theano.config.floatX = 'float32'
#theano.config.linker = 'cvm_nogc'
# Turn on OpenMP support in Theano. Per the Theano config documentation,
# openmp_elemwise_minsize is the minimum size at which elementwise ops are
# parallelized with OpenMP.
theano.config.openmp = True
theano.config.openmp_elemwise_minsize = 20000

# In[3]:

# Read the query file (machine-specific absolute path) and keep only the
# value stored under its top-level 'queries' key.
with open('/data/matthew/external-wiki2.json') as f:
    queries = json.load(f)['queries']

# Build the word-vector object from the vectors file and the redirects file
# (both machine-specific absolute paths).
# NOTE(review): the meaning of negvectors and sentence_length is defined by
# WordVectors, which is not visible here -- confirm against that class.
wordvectors = WordVectors(
    fname=
    "/data/matthew/enwiki-20141208-pages-articles-multistream-links5-output1.bin",
    redir_fname=
    '/data/matthew/enwiki-20141208-pages-articles-multistream-redirects5.json',
    negvectors=False,
    sentence_length=200,
)
# Presumably stops the object from adding words it has not seen before --
# TODO confirm in WordVectors.
wordvectors.add_unknown_words = False

# Module-level alias for the redirects attribute of the word-vector object.
page_redirects = wordvectors.redirects

from wikireader import WikiRegexes, WikipediaReader


def PreProcessedQueries(wikipedia_dump_fname,
                        wordvec=wordvectors,
                        queries=queries,
                        redirects=page_redirects):
import re

# Module-level setup: configure Theano, load the query set, and build the
# shared word-vector object. All of this runs at import time, and the names
# bound here (queries, wordvectors, page_redirects) are used as default
# arguments of PreProcessedQueries below, so they must exist before that
# definition is executed.

# Use 32-bit floats as Theano's floatX type.
theano.config.floatX = 'float32'
#theano.config.linker = 'cvm_nogc'
# Turn on OpenMP support in Theano. Per the Theano config documentation,
# openmp_elemwise_minsize is the minimum size at which elementwise ops are
# parallelized with OpenMP.
theano.config.openmp = True
theano.config.openmp_elemwise_minsize = 20000


# In[3]:

# Read the query file (machine-specific absolute path) and keep only the
# value stored under its top-level 'queries' key.
with open('/data/matthew/external-wiki2.json') as f:
    queries = json.load(f)['queries']

# Build the word-vector object from the vectors file and the redirects file
# (both machine-specific absolute paths).
# NOTE(review): the meaning of negvectors and sentence_length is defined by
# WordVectors, which is not visible here -- confirm against that class.
wordvectors = WordVectors(
    fname="/data/matthew/enwiki-20141208-pages-articles-multistream-links7-output1.bin",
    redir_fname='/data/matthew/enwiki-20141208-pages-articles-multistream-redirect7.json',
    negvectors=False,
    sentence_length=200,
)
# Presumably stops the object from adding words it has not seen before --
# TODO confirm in WordVectors.
wordvectors.add_unknown_words = False

# Module-level alias for the redirects attribute of the word-vector object.
page_redirects = wordvectors.redirects


from wikireader import WikiRegexes, WikipediaReader


def PreProcessedQueries(wikipedia_dump_fname, wordvec=wordvectors, queries=queries, redirects=page_redirects):

    get_words = re.compile('[^a-zA-Z0-9 ]')
    get_link = re.compile('.*?\[(.*?)\].*?')