def write_text(wid):
    """Write cleaned article text for a wiki to per-page files, tracking the last indexed timestamp."""
    try:
        last_indexed_value = open(os.path.join(DATA_DIR, 'last_indexed.txt')).read().strip()
    except IOError:
        last_indexed_value = '2000-01-01T12:00:00.000Z'
    text_dir = os.path.join(DATA_DIR, 'text', str(wid))
    query = 'wid:%s AND iscontent:true' % str(wid)
    if last_indexed_value:
        # Only pull documents indexed since the last run
        query += ' AND indexed:["%s" TO *]' % last_indexed_value
    qi = QueryIterator(get_config(), {
        'query': query,
        'fields': 'pageid, html_en, indexed',
        'sort': 'pageid asc'
    })
    print 'Writing text from %s to file...' % str(wid)
    for doc in qi:
        pageid = doc['pageid']
        # `language` is expected to be defined at module level ('en' matches the html_en field requested above)
        text = '\n'.join(clean_list(doc.get('html_%s' % language, '')))
        last_indexed_value = max(last_indexed_value, doc.get('indexed'))
        # Shard page files into subdirectories keyed on the first character of the page id
        text_subdir = os.path.join(text_dir, str(pageid)[0])
        if not os.path.exists(text_subdir):
            os.makedirs(text_subdir)
        text_filepath = os.path.join(text_subdir, str(pageid))
        with open(text_filepath, 'w') as text_file:
            text_file.write(text)
    with open(os.path.join(DATA_DIR, 'last_indexed.txt'), 'w') as last_indexed_file:
        last_indexed_file.write(last_indexed_value)
    return text_dir
def write_files(wid):
    filepath = os.path.join('html_en', str(wid))
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    qi = QueryIterator(get_config(), {
        'query': 'wid:%s AND iscontent:true' % str(wid),
        'fields': 'pageid, html_en',
        'sort': 'id asc'
    })
    for doc in qi:
        print 'extracting words for %i_%i...' % (wid, doc['pageid'])
        # One gzipped, lowercased HTML file per page
        page_file = GzipFile(os.path.join(filepath, '%s.gz' % doc['pageid']), 'w', compresslevel=9)
        page_file.write(doc.get('html_en', '').lower().encode('utf-8'))
        page_file.close()
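# A minimal read-back sketch for the files written by write_files() above, assuming the
# same 'html_en/<wid>/<pageid>.gz' layout; the function name and example ids are
# illustrative, not part of the original code.
from gzip import GzipFile
import os

def read_page_html(wid, pageid):
    path = os.path.join('html_en', str(wid), '%s.gz' % pageid)
    f = GzipFile(path, 'r')
    try:
        return f.read().decode('utf-8')
    finally:
        f.close()

# Example usage (hypothetical ids):
# print read_page_html(831, 2054)[:200]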
def extract_words(self, wid):
    """Updates db with previously unseen words and lemmas, and page unigrams"""
    words_file = gzip.open(self.words_file, 'a')
    page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
    w = WordPunctTokenizer()
    qi = QueryIterator(get_config(), {
        'query': 'wid:%s AND iscontent:true' % str(wid),
        'fields': 'id, wid, pageid, html_en',
        'sort': 'id asc'
    })
    print 'starting extraction for wiki %s...' % str(wid)
    for doc in qi:
        print 'extracting words for %s...' % doc['id']
        # A tab-prefixed line marks the start of a new page's unigram stream
        page_file.write('\t%s\n' % doc['pageid'])
        for word in w.tokenize(doc.get('html_en', '').lower()):
            if word not in self.words:
                # Assign the next id to the unseen word and append it to the shared words file
                self.words[word] = self.counter
                words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                self.counter += 1
            page_file.write('%i\n' % self.words.get(word, 0))
    page_file.close()
    words_file.close()
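# A small sketch of rebuilding the word-id mapping from the tab-separated words file
# written by extract_words() above; the 'words.gz' filename and helper name are
# assumptions for illustration only.
import gzip

def load_words(words_path='words.gz'):
    words = {}
    f = gzip.open(words_path, 'r')
    for line in f:
        counter, word = line.rstrip('\n').split('\t', 1)
        words[word.decode('utf-8')] = int(counter)
    f.close()
    return words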
def write_batch_files():
    qi = QueryIterator(get_config(), {
        'query': "wid:298117 AND -title_en:\"File:IGN Weekly 'Wood\" is_video:true",
        'fields': 'id,title_en,video_description_txt,video_keywords_txt,video_actors_txt,video_tags_txt,video_genres_txt'
    })
    doc_count = 0
    batch_count = 0
    #tempdir = tempfile.mkdtemp()
    tempdir = '/home/tristan/temp/video'
    for doc in qi:
        if doc_count % 100 == 0:
            batch_count += 1
            filepath = os.path.join(tempdir, str(batch_count))
            if not os.path.exists(filepath):
                os.makedirs(filepath)
        #no_parse = []
        #for field in [u'video_keywords_txt', u'video_tags_txt', u'video_actors_txt', u'video_genres_txt']:
        #    for tag in doc.get(field, []):
        #        no_parse.append(tag)
        #text = '\t'.join(list(set(no_parse))) + '\n'
        #for field in [u'title_en', u'video_description_txt']:
        #    val = doc.get(field, None)
        #    if val:
        #        text += as_string(val)
        fields = []
        for field in doc:
            if field != u'id':
                val = doc.get(field, None)
                if val:
                    fields.append(as_string(doc[field]))
        text = '.\n'.join(fields)
        print text
        output_file = open(os.path.join(filepath, doc[u'id']), 'w')
        output_file.write(text.encode('utf-8'))
        output_file.close()
        doc_count += 1
    return tempdir
def write_batch_files(self):
    qi = QueryIterator(get_config(), {
        'query': 'wid:%s AND iscontent:true' % self.wid,
        'fields': 'id,url,html_en',
        'sort': 'id asc'
    })
    doc_count = 0
    batch_count = 0
    for doc in qi:
        if doc_count % 100 == 0:
            batch_count += 1
            filepath = os.path.join(self.tempdir, str(batch_count))
            if not os.path.exists(filepath):
                os.makedirs(filepath)
        text = normalize(as_string(doc.get('html_en', '')))
        if text != '':
            print 'writing %s to %s' % (doc['id'], filepath)
            output_file = open(os.path.join(filepath, doc['id']), 'w')
            output_file.write(text.encode('utf-8'))
            output_file.close()
        doc_count += 1
from WikiaSolr import QueryIterator

qi = QueryIterator('http://search-s10:8983/solr/xwiki/', {'query': 'lang_s:en', 'fields': 'id', 'sort': 'wam_i desc', 'start': 0, 'limit': 1000})

f = open('top1k', 'w')
for doc in qi:
    f.write('%s\n' % doc['id'])
f.close()
import sys, json, requests, re
import time
#test
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from WikiaSolr import QueryIterator, get_config, as_string, expand_entities, WikiaDomainLoader
from WikiaSolr.StanfordParser import ParserService
import codecs

wid = sys.argv[1]

qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields': 'id,url,html_en'})
#qi = QueryIterator(get_config(), {'query': 'id:3125_199499', 'fields': 'id,url,html_en'})
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'filterquery': 'views:[3500000 TO *]', 'fields': 'id,url,html_en,views'})
#test

service = ParserService()

config = json.loads("".join(open('worker-config.json').readlines()))
host = config["common"]["solr_endpoint"]

entities = {}
confirmed_entities = {}

p = PunktSentenceTokenizer()

doc_count = 0

bullet1 = '\xe2\x80\xa2'.decode('utf-8')
bullet2 = '\xc2\xb7'.decode('utf-8')
import json
import requests
from WikiaSolr import QueryIterator

SOLR = 'http://search-s10:8983/solr/xwiki/'

qi = QueryIterator(SOLR, {'query': 'lang_s:en', 'fields': 'id, hostname_s', 'sort': 'id asc'})

d = {}
for wiki in qi:
    try:
        print wiki['id'], wiki['hostname_s']
    except:
        print 'unicode error'
    d[int(wiki['id'])] = wiki['hostname_s']

with open('hostnames.json', 'w') as f:
    f.write(json.dumps(d))
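# A quick read-back sketch for the hostnames.json file written above. Note that JSON
# object keys are always strings, so the integer wiki ids used when building the dict
# come back as strings; the example wiki id below is hypothetical.
import json

with open('hostnames.json') as f:
    hostnames = json.load(f)

# print hostnames.get('831')  # e.g. look up the hostname for wiki id 831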
from time import time
import sys
import urllib2
from collections import defaultdict

from bs4 import BeautifulSoup
from nltk.tokenize import PunktSentenceTokenizer
from WikiaSolr import QueryIterator, get_config

start_time = time()
wid = sys.argv[1]
TOP_N = 5

def remove_newlines(text):
    if '\n' in text:
        # Keep only lines that look like sentences (contain a period)
        return ' '.join([line for line in text.split('\n') if '.' in line])
    return text

qi = QueryIterator(get_config(), {
    'query': 'wid:%s AND confirmed_entities_txt:*' % wid,
    'fields': 'id,url,html_en,confirmed_entities_txt'
})

for doc in qi:
    entity_tally = defaultdict(int)
    confirmed_entities = [entity.lower() for entity in doc.get('confirmed_entities_txt', [])]
    html = urllib2.urlopen(doc['url']).read()
    soup = BeautifulSoup(html)
    text = ' '.join([p.text for p in soup.find_all('p')])
    # not_infobox is assumed to be defined elsewhere in this script
    sentences = filter(not_infobox, [remove_newlines(sentence)
                                     for sentence in PunktSentenceTokenizer().tokenize(text)])
from WikiaSolr import QueryIterator

qi = QueryIterator('http://dev-search:8983/solr/xwiki/', {'query': 'lang_s:en', 'fields': 'id', 'sort': 'wam_i desc', 'limit': 1500})

with open('/data/top1500.txt', 'w') as f:
    for doc in qi:
        f.write(doc['id'] + '\n')
import sys, re, json, requests
from operator import itemgetter
from WikiaSolr import QueryIterator, get_config

wid = sys.argv[1]

config = json.loads(''.join(open('worker-config.json').readlines()))
host = config['common']['solr_endpoint']

qi = QueryIterator(get_config(), {'query': 'wid:%s AND suspected_entities_txt:*' % wid, 'fields': 'id,suspected_entities_txt,confirmed_entities_txt'})
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields': 'id,suspected_entities_txt,confirmed_entities_txt', 'filterquery': 'views:[2000000 TO *]'})

suspected_entities = {}
confirmed_entities = {}

def hasalpha(string):
    import string as s
    if string:
        for letter in s.lowercase:
            if letter in string:
                return True
    return False

def normalize(string):
    string = re.sub(u'[^\w\s]|_', u' ', string.lower())
    string = re.sub(u' {2,}', u' ', string)
    return string.strip()

count = 0
for doc in qi:
import re
from WikiaSolr import QueryIterator
from boto.s3.connection import S3Connection
from boto.s3.key import Key

qi = QueryIterator('http://search-s10:8983/solr/xwiki/', {'query': 'lang_s:en', 'fields': 'id', 'sort': 'wam_i desc', 'wt': 'json'})

bucket = S3Connection().get_bucket('nlp-data')

titles = {}
for key in bucket.list(prefix='article_titles/'):
    if key.name.endswith('gz'):
        wid = int(re.search('/([0-9]+)\.gz', key.name).group(1))
        print 'Adding %i to titles dict...' % wid
        titles[wid] = True

redirects = {}
for key in bucket.list(prefix='article_redirects/'):
    if key.name.endswith('gz'):
        wid = int(re.search('/([0-9]+)\.gz', key.name).group(1))
        print 'Adding %i to redirects dict...' % wid
        redirects[wid] = True

missing_titles = open('missing_titles.txt', 'w')
missing_redirects = open('missing_redirects.txt', 'w')