Example 1
def write_text(wid):
    try:
        last_indexed_value = open(os.path.join(
            DATA_DIR, 'last_indexed.txt')).read().strip()
    except IOError:
        last_indexed_value = '2000-01-01T12:00:00.000Z'
    text_dir = os.path.join(DATA_DIR, 'text', str(wid))
    query = 'wid:%s AND iscontent:true' % str(wid)
    if last_indexed_value:
        query += ' AND indexed:["%s" TO *]' % last_indexed_value
    qi = QueryIterator(get_config(), {
        'query': query,
        'fields': 'pageid, html_en, indexed',
        'sort': 'pageid asc'
    })
    print 'Writing text from %s to file...' % str(wid)
    for doc in qi:
        pageid = doc['pageid']
        text = '\n'.join(clean_list(doc.get('html_en', '')))
        last_indexed_value = max(last_indexed_value, doc.get('indexed'))
        text_subdir = os.path.join(text_dir, str(pageid)[0])
        if not os.path.exists(text_subdir):
            os.makedirs(text_subdir)
        text_filepath = os.path.join(text_subdir, str(pageid))
        with open(text_filepath, 'w') as text_file:
            text_file.write(text)
    with open(os.path.join(DATA_DIR, 'last_indexed.txt'),
              'w') as last_indexed_file:
        last_indexed_file.write(last_indexed_value)
    return text_dir
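
write_text shards each page's plain text under text/<wid>/<first character of the page id>/<pageid>. The sketch below only illustrates walking that layout back into (pageid, text) pairs; read_pages and its data_dir argument are hypothetical, standing in for the script's DATA_DIR constant.

import os

def read_pages(wid, data_dir):
    # Walk the sharded layout produced by write_text and yield (pageid, text).
    text_dir = os.path.join(data_dir, 'text', str(wid))
    for subdir, _, filenames in os.walk(text_dir):
        for pageid in filenames:
            with open(os.path.join(subdir, pageid)) as text_file:
                yield pageid, text_file.read()

for pageid, text in read_pages(831, '/path/to/data'):  # placeholder wid and path
    print pageid, len(text)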
Example 2
def write_files(wid):
    filepath = os.path.join('html_en', str(wid))
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    qi = QueryIterator(
        get_config(), {
            'query': 'wid:%s AND iscontent:true' % str(wid),
            'fields': 'pageid, html_en',
            'sort': 'id asc'
        })
    for doc in qi:
        print 'extracting words for %i_%i...' % (wid, doc['pageid'])
        page_file = GzipFile(os.path.join(filepath, '%s.gz' % doc['pageid']),
                             'w',
                             compresslevel=9)
        page_file.write(doc.get('html_en', '').lower().encode('utf-8'))
        page_file.close()
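
A companion sketch, assuming the html_en/<wid>/<pageid>.gz layout written above, that decompresses a single page back into unicode; read_page is an illustrative helper, not part of the original code.

import os
from gzip import GzipFile

def read_page(wid, pageid):
    # Reverse of write_files: decompress one page and decode it back to unicode.
    page_file = GzipFile(os.path.join('html_en', str(wid), '%s.gz' % pageid), 'r')
    try:
        return page_file.read().decode('utf-8')
    finally:
        page_file.close()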
Example 3
    def extract_words(self, wid):
        """Updates db with previously unseen words and lemmas, and page unigrams"""
        words_file = gzip.open(self.words_file, 'a')
        page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
        w = WordPunctTokenizer()
        qi = QueryIterator(
            get_config(), {
                'query': 'wid:%s AND iscontent:true' % str(wid),
                'fields': 'id, wid, pageid, html_en',
                'sort': 'id asc'
            })
        print 'starting extraction for wiki %s...' % str(wid)
        for doc in qi:
            print 'extracting words for %s...' % doc['id']
            page_file.write('\t%s\n' % doc['pageid'])
            for word in w.tokenize(doc.get('html_en', '').lower()):
                if word not in self.words:
                    self.words[word] = self.counter
                    words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                    self.counter += 1
                page_file.write('%i\n' % self.words.get(word, 0))
        page_file.close()
        words_file.close()
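
The two gzip files above use a simple line format: the shared words file stores "<word id>\t<word>" rows, and each per-wiki file stores a "\t<pageid>" header followed by one word id per line. A hedged sketch of reading them back; load_unigrams and its path arguments are illustrative only.

import gzip
from collections import defaultdict

def load_unigrams(words_path, wiki_path):
    # Rebuild {pageid: [word, ...]} from the files written by extract_words.
    id_to_word = {}
    for line in gzip.open(words_path):
        word_id, word = line.rstrip('\n').split('\t', 1)
        id_to_word[int(word_id)] = word.decode('utf-8')
    pages = defaultdict(list)
    pageid = None
    for line in gzip.open(wiki_path):
        if line.startswith('\t'):
            pageid = line.strip()  # "\t<pageid>" marks the start of a new page
        else:
            pages[pageid].append(id_to_word.get(int(line), u''))
    return pages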
Example 4
def write_batch_files():
    qi = QueryIterator(
        get_config(), {
            'query': "wid:298117 AND -title_en:\"File:IGN Weekly 'Wood\" is_video:true",
            'fields': ('id,title_en,video_description_txt,video_keywords_txt,'
                       'video_actors_txt,video_tags_txt,video_genres_txt')
        })

    doc_count = 0
    batch_count = 0

    #tempdir = tempfile.mkdtemp()
    tempdir = '/home/tristan/temp/video'

    for doc in qi:
        if doc_count % 100 == 0:
            batch_count += 1
            filepath = os.path.join(tempdir, str(batch_count))
            if not os.path.exists(filepath):
                os.makedirs(filepath)
        #no_parse = []
        #for field in [u'video_keywords_txt', u'video_tags_txt', u'video_actors_txt', u'video_genres_txt']:
        #    for tag in doc.get(field, []):
        #        no_parse.append(tag)
        #text = '\t'.join(list(set(no_parse))) + '\n'
        #for field in [u'title_en', u'video_description_txt']:
        #    val = doc.get(field, None)
        #    if val:
        #        text += as_string(val)
        fields = []
        for field in doc:
            if field != u'id':
                val = doc.get(field, None)
                if val:
                    fields.append(as_string(doc[field]))
        text = '.\n'.join(fields)
        print text
        output_file = open(os.path.join(filepath, doc[u'id']), 'w')
        output_file.write(text.encode('utf-8'))
        output_file.close()
        doc_count += 1
    return tempdir
Example 5
    def write_batch_files(self):
        qi = QueryIterator(
            get_config(), {
                'query': 'wid:%s AND iscontent:true' % self.wid,
                'fields': 'id,url,html_en',
                'sort': 'id asc'
            })

        doc_count = 0
        batch_count = 0

        for doc in qi:
            if doc_count % 100 == 0:
                batch_count += 1
                filepath = os.path.join(self.tempdir, str(batch_count))
                if not os.path.exists(filepath):
                    os.makedirs(filepath)
            text = normalize(as_string(doc.get('html_en', '')))
            if text != '':
                print 'writing %s to %s' % (doc['id'], filepath)
                output_file = open(os.path.join(filepath, doc['id']), 'w')
                output_file.write(text.encode('utf-8'))
                output_file.close()
            doc_count += 1
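
Both write_batch_files variants above lay their output out the same way: numbered batch directories holding at most 100 files each, one UTF-8 text file per Solr document id. A minimal sketch of iterating that layout, assuming the temp directory contains only the numbered batch directories; iter_batches is a hypothetical helper.

import os

def iter_batches(tempdir):
    # Walk the numbered batch directories and yield (doc_id, text) pairs.
    for batch in sorted(os.listdir(tempdir), key=int):
        batch_dir = os.path.join(tempdir, batch)
        for doc_id in os.listdir(batch_dir):
            with open(os.path.join(batch_dir, doc_id)) as batch_file:
                yield doc_id, batch_file.read().decode('utf-8')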
Example 6
from WikiaSolr import QueryIterator

qi = QueryIterator(
    'http://search-s10:8983/solr/xwiki/', {
        'query': 'lang_s:en',
        'fields': 'id',
        'sort': 'wam_i desc',
        'start': 0,
        'limit': 1000
    })

f = open('top1k', 'w')

for doc in qi:
    f.write('%s\n' % doc['id'])

f.close()
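
The resulting top1k file holds one numeric wiki id per line, ordered by descending WAM score. Reading it back in is a one-liner, shown here only as a hedged usage note.

with open('top1k') as f:
    top_wiki_ids = [int(line.strip()) for line in f if line.strip()]
print top_wiki_ids[:10]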
Example 7
import sys, json, requests, re
import time  #test
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from WikiaSolr import QueryIterator, get_config, as_string, expand_entities, WikiaDomainLoader
from WikiaSolr.StanfordParser import ParserService
import codecs

wid = sys.argv[1]

qi = QueryIterator(get_config(), {
    'query': 'wid:%s AND iscontent:true' % wid,
    'fields': 'id,url,html_en'
})
#qi = QueryIterator(get_config(), {'query': 'id:3125_199499', 'fields':'id,url,html_en' })
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'filterquery': 'views:[3500000 TO *]', 'fields':'id,url,html_en,views' }) #test

service = ParserService()

config = json.loads("".join(open('worker-config.json').readlines()))
host = config["common"]["solr_endpoint"]

entities = {}
confirmed_entities = {}

p = PunktSentenceTokenizer()

doc_count = 0

bullet1 = '\xe2\x80\xa2'.decode('utf-8')
bullet2 = '\xc2\xb7'.decode('utf-8')
Example 8
import json
import requests
from WikiaSolr import QueryIterator

SOLR = 'http://search-s10:8983/solr/xwiki/'

qi = QueryIterator(SOLR, {'query': 'lang_s:en', 'fields': 'id, hostname_s', 'sort': 'id asc'})

d = {}

for wiki in qi:
    try:
        print wiki['id'], wiki['hostname_s']
    except UnicodeError:
        print 'unicode error'
    d[int(wiki['id'])] = wiki['hostname_s']

with open('hostnames.json', 'w') as f:
    f.write(json.dumps(d))
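
One thing worth noting when reloading hostnames.json: JSON object keys are always strings, so the integer wiki ids used as keys above come back as strings. A small hedged sketch of restoring integer keys on load:

import json

with open('hostnames.json') as f:
    hostnames = dict((int(wid), hostname)
                     for wid, hostname in json.load(f).items())

print hostnames.get(831, 'unknown hostname')  # 831 is a placeholder wiki id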
Example 9
import sys, urllib2
from time import time
from collections import defaultdict
from bs4 import BeautifulSoup
from nltk.tokenize import PunktSentenceTokenizer
from WikiaSolr import QueryIterator, get_config

start_time = time()

wid = sys.argv[1]
TOP_N = 5


def remove_newlines(text):
    if '\n' in text:
        return ' '.join([line for line in text.split('\n') if '.' in line])
    return text


qi = QueryIterator(
    get_config(), {
        'query': 'wid:%s AND confirmed_entities_txt:*' % wid,
        'fields': 'id,url,html_en,confirmed_entities_txt'
    })

for doc in qi:
    entity_tally = defaultdict(int)
    confirmed_entities = [
        entity.lower() for entity in doc.get('confirmed_entities_txt', [])
    ]
    html = urllib2.urlopen(doc['url']).read()
    soup = BeautifulSoup(html)
    text = ' '.join([p.text for p in soup.find_all('p')])
    sentences = filter(not_infobox, [
        remove_newlines(sentence)
        for sentence in PunktSentenceTokenizer().tokenize(text)
    ])
Example 10
from WikiaSolr import QueryIterator

qi = QueryIterator('http://dev-search:8983/solr/xwiki/', {
    'query': 'lang_s:en',
    'fields': 'id',
    'sort': 'wam_i desc',
    'limit': 1500
})

with open('/data/top1500.txt', 'w') as f:
    for doc in qi:
        f.write(doc['id'] + '\n')
Example 11
import sys, re, json, requests
from operator import itemgetter
from WikiaSolr import QueryIterator, get_config

wid = sys.argv[1]

config = json.loads(''.join(open('worker-config.json').readlines()))
host = config['common']['solr_endpoint']

qi = QueryIterator(
    get_config(), {
        'query': 'wid:%s AND suspected_entities_txt:*' % wid,
        'fields': 'id,suspected_entities_txt,confirmed_entities_txt'
    })
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields': 'id,suspected_entities_txt,confirmed_entities_txt', 'filterquery': 'views:[2000000 TO *]'})

suspected_entities = {}
confirmed_entities = {}

def hasalpha(string):
    import string as s
    if string:
        for letter in s.lowercase:
            if letter in string:
                return True
    return False

def normalize(string):
    string = re.sub(u'[^\w\s]|_', u' ', string.lower())
    string = re.sub(u' {2,}', u' ', string)
    return string.strip()

count = 0

for doc in qi:
Example 12
import re
from WikiaSolr import QueryIterator
from boto.s3.connection import S3Connection
from boto.s3.key import Key

qi = QueryIterator('http://search-s10:8983/solr/xwiki/', {
    'query': 'lang_s:en',
    'fields': 'id',
    'sort': 'wam_i desc',
    'wt': 'json'
})

bucket = S3Connection().get_bucket('nlp-data')

titles = {}
for key in bucket.list(prefix='article_titles/'):
    if key.name.endswith('gz'):
        wid = int(re.search('/([0-9]+)\.gz', key.name).group(1))
        print 'Adding %i to titles dict...' % wid
        titles[wid] = True

redirects = {}
for key in bucket.list(prefix='article_redirects/'):
    if key.name.endswith('gz'):
        wid = int(re.search('/([0-9]+)\.gz', key.name).group(1))
        print 'Adding %i to redirects dict...' % wid
        redirects[wid] = True

missing_titles = open('missing_titles.txt', 'w')
missing_redirects = open('missing_redirects.txt', 'w')