def write_text(wid):
    try:
        last_indexed_value = open(os.path.join(DATA_DIR, 'last_indexed.txt')).read().strip()
    except IOError:
        last_indexed_value = '2000-01-01T12:00:00.000Z'
    text_dir = os.path.join(DATA_DIR, 'text', str(wid))
    query = 'wid:%s AND iscontent:true' % str(wid)
    if last_indexed_value:
        query += ' AND indexed:["%s" TO *]' % last_indexed_value
    qi = QueryIterator(get_config(), {'query': query,
                                      'fields': 'pageid, html_en, indexed',
                                      'sort': 'pageid asc'})
    print 'Writing text from %s to file...' % str(wid)
    for doc in qi:
        pageid = doc['pageid']
        text = '\n'.join(clean_list(doc.get('html_en', '')))
        last_indexed_value = max(last_indexed_value, doc.get('indexed'))
        # Shard page files into subdirectories keyed on the first digit of the page id
        text_subdir = os.path.join(text_dir, str(pageid)[0])
        if not os.path.exists(text_subdir):
            os.makedirs(text_subdir)
        text_filepath = os.path.join(text_subdir, str(pageid))
        with open(text_filepath, 'w') as text_file:
            text_file.write(text)
    with open(os.path.join(DATA_DIR, 'last_indexed.txt'), 'w') as last_indexed_file:
        last_indexed_file.write(last_indexed_value)
    return text_dir
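# Usage sketch (illustrative only): DATA_DIR, clean_list, QueryIterator and
# get_config are assumed to be defined/imported in this module; the wiki id 831
# below is a hypothetical example value.
#
#     text_dir = write_text(831)
#     print 'per-page text written under %s' % text_dir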
def write_files(wid):
    filepath = os.path.join('html_en', str(wid))
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % str(wid),
                                      'fields': 'pageid, html_en',
                                      'sort': 'id asc'})
    for doc in qi:
        print 'extracting words for %i_%i...' % (wid, doc['pageid'])
        page_file = GzipFile(os.path.join(filepath, '%s.gz' % doc['pageid']), 'w', compresslevel=9)
        page_file.write(doc.get('html_en', '').lower().encode('utf-8'))
        page_file.close()
def extract_words(self, wid):
    """Updates db with previously unseen words and lemmas, and page unigrams"""
    words_file = gzip.open(self.words_file, 'a')
    page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
    w = WordPunctTokenizer()
    qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % str(wid),
                                      'fields': 'id, wid, pageid, html_en',
                                      'sort': 'id asc'})
    print 'starting extraction for wiki %s...' % str(wid)
    for doc in qi:
        print 'extracting words for %s...' % doc['id']
        page_file.write('\t%s\n' % doc['pageid'])
        for word in w.tokenize(doc.get('html_en', '').lower()):
            if word not in self.words:
                self.words[word] = self.counter
                words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                self.counter += 1
            page_file.write('%i\n' % self.words.get(word, 0))
    page_file.close()
    words_file.close()
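# Usage sketch (illustrative only): extract_words is a method, so the wrapper class
# below is hypothetical; it is assumed to provide words_file, wiki_filepath, a words
# dict and an integer counter as used above.
#
#     extractor = WordExtractor()   # hypothetical owner of self.words / self.counter
#     extractor.extract_words(831)  # appends new word ids, writes per-page unigram id streams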
def write_batch_files(self):
    qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % self.wid,
                                      'fields': 'id,url,html_en',
                                      'sort': 'id asc'})
    doc_count = 0
    batch_count = 0
    for doc in qi:
        if doc_count % 100 == 0:
            batch_count += 1
            filepath = os.path.join(self.tempdir, str(batch_count))
            if not os.path.exists(filepath):
                os.makedirs(filepath)
        text = normalize(as_string(doc.get('html_en', '')))
        if text != '':
            print 'writing %s to %s' % (doc['id'], filepath)
            output_file = open(os.path.join(filepath, doc['id']), 'w')
            output_file.write(text.encode('utf-8'))
            output_file.close()
        doc_count += 1
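# Usage sketch (illustrative only): the owning class is not shown here, so the
# constructor below is hypothetical; it is assumed to set self.wid and self.tempdir
# (e.g. via tempfile.mkdtemp()) before write_batch_files is called.
#
#     batcher = BatchWriter(wid=831)  # hypothetical constructor
#     batcher.write_batch_files()     # writes normalized docs, 100 per numbered batch directory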
def write_batch_files():
    qi = QueryIterator(get_config(),
                       {'query': "wid:298117 AND -title_en:\"File:IGN Weekly 'Wood\" is_video:true",
                        'fields': 'id,title_en,video_description_txt,video_keywords_txt,video_actors_txt,video_tags_txt,video_genres_txt'})
    doc_count = 0
    batch_count = 0
    #tempdir = tempfile.mkdtemp()
    tempdir = '/home/tristan/temp/video'
    for doc in qi:
        if doc_count % 100 == 0:
            batch_count += 1
            filepath = os.path.join(tempdir, str(batch_count))
            if not os.path.exists(filepath):
                os.makedirs(filepath)
        #no_parse = []
        #for field in [u'video_keywords_txt', u'video_tags_txt', u'video_actors_txt', u'video_genres_txt']:
        #    for tag in doc.get(field, []):
        #        no_parse.append(tag)
        #text = '\t'.join(list(set(no_parse))) + '\n'
        #for field in [u'title_en', u'video_description_txt']:
        #    val = doc.get(field, None)
        #    if val:
        #        text += as_string(val)
        fields = []
        for field in doc:
            if field != u'id':
                val = doc.get(field, None)
                if val:
                    fields.append(as_string(doc[field]))
        text = '.\n'.join(fields)
        print text
        output_file = open(os.path.join(filepath, doc[u'id']), 'w')
        output_file.write(text.encode('utf-8'))
        output_file.close()
        doc_count += 1
    return tempdir
import sys, json, requests, re
import time
#test
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from WikiaSolr import QueryIterator, get_config, as_string, expand_entities, WikiaDomainLoader
from WikiaSolr.StanfordParser import ParserService
import codecs

wid = sys.argv[1]

qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields': 'id,url,html_en'})
#qi = QueryIterator(get_config(), {'query': 'id:3125_199499', 'fields': 'id,url,html_en'})
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'filterquery': 'views:[3500000 TO *]', 'fields': 'id,url,html_en,views'})
#test

service = ParserService()

config = json.loads("".join(open('worker-config.json').readlines()))
host = config["common"]["solr_endpoint"]

entities = {}
confirmed_entities = {}

p = PunktSentenceTokenizer()

doc_count = 0
bullet1 = '\xe2\x80\xa2'.decode('utf-8')
bullet2 = '\xc2\xb7'.decode('utf-8')

start_time = time.time()
import sys
import urllib2
from collections import defaultdict

from bs4 import BeautifulSoup
from nltk.tokenize.punkt import PunktSentenceTokenizer
from WikiaSolr import QueryIterator, get_config
from normalize import not_infobox
from time import time

start_time = time()

wid = sys.argv[1]
TOP_N = 5

def remove_newlines(text):
    if '\n' in text:
        return ' '.join([line for line in text.split('\n') if '.' in line])
    return text

qi = QueryIterator(get_config(), {'query': 'wid:%s AND confirmed_entities_txt:*' % wid,
                                  'fields': 'id,url,html_en,confirmed_entities_txt'})

for doc in qi:
    entity_tally = defaultdict(int)
    confirmed_entities = [entity.lower() for entity in doc.get('confirmed_entities_txt', [])]
    html = urllib2.urlopen(doc['url']).read()
    soup = BeautifulSoup(html)
    text = ' '.join([p.text for p in soup.find_all('p')])
    sentences = filter(not_infobox,
                       [remove_newlines(sentence) for sentence in PunktSentenceTokenizer().tokenize(text)])
    # Tally how many confirmed entities each sentence mentions
    for (i, sentence) in enumerate(sentences):
        lowercase = sentence.lower()
        for entity in confirmed_entities:
            if entity in lowercase:
                entity_tally[i] += 1
    # Fall back to the first TOP_N sentences when no entity is mentioned anywhere
    if not entity_tally:
        summary = ' '.join(sentences[:TOP_N]).encode('utf-8')
import sys, re, json, requests
from operator import itemgetter
from WikiaSolr import QueryIterator, get_config

wid = sys.argv[1]

config = json.loads(''.join(open('worker-config.json').readlines()))
host = config['common']['solr_endpoint']

qi = QueryIterator(get_config(), {'query': 'wid:%s AND suspected_entities_txt:*' % wid,
                                  'fields': 'id,suspected_entities_txt,confirmed_entities_txt'})
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields': 'id,suspected_entities_txt,confirmed_entities_txt', 'filterquery': 'views:[2000000 TO *]'})

suspected_entities = {}
confirmed_entities = {}

def hasalpha(string):
    import string as s
    if string:
        for letter in s.lowercase:
            if letter in string:
                return True
    return False

def normalize(string):
    string = re.sub(u'[^\w\s]|_', u' ', string.lower())
    string = re.sub(u' {2,}', u' ', string)
    return string.strip()

count = 0
for doc in qi: