Example 1
def write_text(wid):
    # resume from the last recorded Solr index timestamp, or fall back to a default date
    try:
        last_indexed_value = open(os.path.join(
            DATA_DIR, 'last_indexed.txt')).read().strip()
    except IOError:
        last_indexed_value = '2000-01-01T12:00:00.000Z'
    text_dir = os.path.join(DATA_DIR, 'text', str(wid))
    query = 'wid:%s AND iscontent:true' % str(wid)
    if last_indexed_value:
        # only fetch documents indexed since the last recorded timestamp
        query += ' AND indexed:["%s" TO *]' % last_indexed_value
    qi = QueryIterator(get_config(), {
        'query': query,
        'fields': 'pageid, html_en, indexed',
        'sort': 'pageid asc'
    })
    print 'Writing text from %s to file...' % str(wid)
    for doc in qi:
        pageid = doc['pageid']
        # 'html_en' matches the field requested in the query above
        text = '\n'.join(clean_list(doc.get('html_en', '')))
        last_indexed_value = max(last_indexed_value, doc.get('indexed'))
        text_subdir = os.path.join(text_dir, str(pageid)[0])
        if not os.path.exists(text_subdir):
            os.makedirs(text_subdir)
        text_filepath = os.path.join(text_subdir, str(pageid))
        with open(text_filepath, 'w') as text_file:
            text_file.write(text)
    with open(os.path.join(DATA_DIR, 'last_indexed.txt'),
              'w') as last_indexed_file:
        last_indexed_file.write(last_indexed_value)
    return text_dir
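
The write_text snippet above relies on module-level names that are not shown: DATA_DIR, clean_list, and the QueryIterator/get_config helpers that other examples on this page import from WikiaSolr. A minimal sketch of that surrounding setup, with the DATA_DIR path and the clean_list implementation as illustrative assumptions only, might look like this:

import os
import sys
from WikiaSolr import QueryIterator, get_config  # same import used by the later examples

DATA_DIR = '/data/text_extraction'  # assumed location of last_indexed.txt and the text/ output tree

def clean_list(html):
    # hypothetical stand-in: split raw page HTML into non-empty, stripped text lines
    return [line.strip() for line in html.splitlines() if line.strip()]

if __name__ == '__main__':
    # e.g. `python write_text.py 831` prints the directory the text was written to
    print write_text(sys.argv[1])
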
Example 2
def write_files(wid):
    filepath = os.path.join('html_en', str(wid))
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % str(wid), 'fields': 'pageid, html_en', 'sort': 'id asc'})
    for doc in qi:
        print 'extracting words for %i_%i...' % (wid, doc['pageid'])
        page_file = GzipFile(os.path.join(filepath, '%s.gz' % doc['pageid']), 'w', compresslevel=9)
        page_file.write(doc.get('html_en', '').lower().encode('utf-8'))
        page_file.close()
Example 3
def write_files(wid):
    filepath = os.path.join('html_en', str(wid))
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    qi = QueryIterator(
        get_config(), {
            'query': 'wid:%s AND iscontent:true' % str(wid),
            'fields': 'pageid, html_en',
            'sort': 'id asc'
        })
    for doc in qi:
        print 'extracting words for %i_%i...' % (wid, doc['pageid'])
        page_file = GzipFile(os.path.join(filepath, '%s.gz' % doc['pageid']),
                             'w',
                             compresslevel=9)
        page_file.write(doc.get('html_en', '').lower().encode('utf-8'))
        page_file.close()
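
Both write_files variants above produce one gzip file per page under html_en/<wid>/, containing the lowercased page HTML encoded as UTF-8. Reading one of those files back simply reverses the write; a small sketch (wid and pageid are placeholders supplied by the caller):

import os
from gzip import GzipFile

def read_page(wid, pageid):
    # mirrors write_files: html_en/<wid>/<pageid>.gz holds lowercased UTF-8 page HTML
    path = os.path.join('html_en', str(wid), '%s.gz' % pageid)
    page_file = GzipFile(path, 'r')
    try:
        return page_file.read().decode('utf-8')
    finally:
        page_file.close()
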
Example 4
    def extract_words(self, wid):
        """Updates db with previously unseen words and lemmas, and page unigrams"""
        words_file = gzip.open(self.words_file, 'a')
        page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
        w = WordPunctTokenizer()
        qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % str(wid), 'fields': 'id, wid, pageid, html_en', 'sort': 'id asc'})
        print 'starting extraction for wiki %s...' % str(wid)
        for doc in qi:
            print 'extracting words for %s...' % doc['id']
            page_file.write('\t%s\n' % doc['pageid'])
            for word in w.tokenize(doc.get('html_en', '').lower()):
                if word not in self.words:
                    self.words[word] = self.counter
                    words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                    self.counter += 1
                page_file.write('%i\n' % self.words.get(word, 0))
        page_file.close()
        words_file.close()
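
extract_words writes two gzip streams: a shared vocabulary file with one id<TAB>word line per new word, and a per-wiki file in which each page starts with a tab-prefixed pageid line followed by one integer word id per token. A hedged reader for that format (the file paths are whatever self.words_file and self.wiki_filepath pointed to) could look like:

import gzip
from collections import defaultdict

def load_unigrams(words_path, wiki_path):
    # vocabulary file: "<id>\t<word>" lines, as written by extract_words
    id_to_word = {}
    for line in gzip.open(words_path):
        word_id, word = line.rstrip('\n').split('\t', 1)
        id_to_word[int(word_id)] = word.decode('utf-8')
    # per-wiki file: "\t<pageid>" opens a page, every other line is a word id
    pages = defaultdict(list)
    pageid = None
    for line in gzip.open(wiki_path):
        line = line.rstrip('\n')
        if line.startswith('\t'):
            pageid = line[1:]
        elif line and pageid is not None:
            pages[pageid].append(id_to_word.get(int(line), ''))
    return pages
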
Example 5
    def write_batch_files(self):
        qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % self.wid, 'fields':'id,url,html_en', 'sort': 'id asc'})

        doc_count = 0
        batch_count = 0

        for doc in qi:
            # start a new numbered batch directory every 100 documents
            if doc_count % 100 == 0:
                batch_count += 1
                filepath = os.path.join(self.tempdir, str(batch_count))
                if not os.path.exists(filepath):
                    os.makedirs(filepath)
            text = normalize(as_string(doc.get('html_en', '')))
            if text != '':
                print 'writing %s to %s' % (doc['id'], filepath)
                output_file = open(os.path.join(filepath, doc['id']), 'w')
                output_file.write(text.encode('utf-8'))
                output_file.close()
            doc_count += 1
Example 6
def write_batch_files():
    qi = QueryIterator(get_config(), {'query': "wid:298117 AND -title_en:\"File:IGN Weekly 'Wood\" is_video:true", 'fields': 'id,title_en,video_description_txt,video_keywords_txt,video_actors_txt,video_tags_txt,video_genres_txt'})

    doc_count = 0
    batch_count = 0

    #tempdir = tempfile.mkdtemp()
    tempdir = '/home/tristan/temp/video'

    for doc in qi:
        if doc_count % 100 == 0:
            batch_count += 1
            filepath = os.path.join(tempdir, str(batch_count))
            if not os.path.exists(filepath):
                os.makedirs(filepath)
        #no_parse = []
        #for field in [u'video_keywords_txt', u'video_tags_txt', u'video_actors_txt', u'video_genres_txt']:
        #    for tag in doc.get(field, []):
        #        no_parse.append(tag)
        #text = '\t'.join(list(set(no_parse))) + '\n'
        #for field in [u'title_en', u'video_description_txt']:
        #    val = doc.get(field, None)
        #    if val:
        #        text += as_string(val)
        fields = []
        for field in doc:
            if field != u'id':
                val = doc.get(field, None)
                if val:
                    fields.append(as_string(doc[field]))
        text = '.\n'.join(fields)
        print text
        output_file = open(os.path.join(filepath, doc[u'id']), 'w')
        output_file.write(text.encode('utf-8'))
        output_file.close()
        doc_count += 1
    return tempdir
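
This variant returns the temp directory holding the numbered batch subdirectories of plain-text files, presumably so a downstream batch process can pick them up. A sketch of walking that output (assuming every entry under tempdir is one of the numbered batch directories):

import os

def iter_batch_files(tempdir):
    # yield (doc_id, text) pairs from the numbered batch directories written above
    for batch in sorted(os.listdir(tempdir), key=int):
        batch_dir = os.path.join(tempdir, batch)
        for doc_id in os.listdir(batch_dir):
            with open(os.path.join(batch_dir, doc_id)) as doc_file:
                yield doc_id, doc_file.read().decode('utf-8')
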
Example 7
    def write_batch_files(self):
        qi = QueryIterator(
            get_config(), {
                'query': 'wid:%s AND iscontent:true' % self.wid,
                'fields': 'id,url,html_en',
                'sort': 'id asc'
            })

        doc_count = 0
        batch_count = 0

        for doc in qi:
            if doc_count % 100 == 0:
                batch_count += 1
                filepath = os.path.join(self.tempdir, str(batch_count))
                if not os.path.exists(filepath):
                    os.makedirs(filepath)
            text = normalize(as_string(doc.get('html_en', '')))
            if text != '':
                print 'writing %s to %s' % (doc['id'], filepath)
                output_file = open(os.path.join(filepath, doc['id']), 'w')
                output_file.write(text.encode('utf-8'))
                output_file.close()
            doc_count += 1
Example 8
def write_text(wid):
    try:
        last_indexed_value = open(os.path.join(DATA_DIR, 'last_indexed.txt')).read().strip()
    except IOError:
        last_indexed_value = '2000-01-01T12:00:00.000Z'
    text_dir = os.path.join(DATA_DIR, 'text', str(wid))
    query = 'wid:%s AND iscontent:true' % str(wid)
    if last_indexed_value:
        query += ' AND indexed:["%s" TO *]' % last_indexed_value
    qi = QueryIterator(get_config(), {'query': query, 'fields': 'pageid, html_en, indexed', 'sort': 'pageid asc'})
    print 'Writing text from %s to file...' % str(wid)
    for doc in qi:
        pageid = doc['pageid']
        text = '\n'.join(clean_list(doc.get('html_en', '')))
        last_indexed_value = max(last_indexed_value, doc.get('indexed'))
        text_subdir = os.path.join(text_dir, str(pageid)[0])
        if not os.path.exists(text_subdir):
            os.makedirs(text_subdir)
        text_filepath = os.path.join(text_subdir, str(pageid))
        with open(text_filepath, 'w') as text_file:
            text_file.write(text)
    with open(os.path.join(DATA_DIR, 'last_indexed.txt'), 'w') as last_indexed_file:
        last_indexed_file.write(last_indexed_value)
    return text_dir
Example 9
import sys, json, requests, re
import time  #test
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from WikiaSolr import QueryIterator, get_config, as_string, expand_entities, WikiaDomainLoader
from WikiaSolr.StanfordParser import ParserService
import codecs

wid = sys.argv[1]

qi = QueryIterator(get_config(), {
    'query': 'wid:%s AND iscontent:true' % wid,
    'fields': 'id,url,html_en'
})
#qi = QueryIterator(get_config(), {'query': 'id:3125_199499', 'fields':'id,url,html_en' })
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'filterquery': 'views:[3500000 TO *]', 'fields':'id,url,html_en,views' }) #test

service = ParserService()

config = json.loads("".join(open('worker-config.json').readlines()))
host = config["common"]["solr_endpoint"]

entities = {}
confirmed_entities = {}

p = PunktSentenceTokenizer()

doc_count = 0

bullet1 = '\xe2\x80\xa2'.decode('utf-8')
bullet2 = '\xc2\xb7'.decode('utf-8')
Example 10
import sys
import urllib2
from collections import defaultdict
from bs4 import BeautifulSoup  # bs4 inferred from the find_all() call below
from nltk.tokenize.punkt import PunktSentenceTokenizer
from WikiaSolr import QueryIterator, get_config
from normalize import not_infobox
from time import time

start_time = time()

wid = sys.argv[1]
TOP_N = 5

def remove_newlines(text):
    if '\n' in text:
        return ' '.join([line for line in text.split('\n') if '.' in line])
    return text

qi = QueryIterator(get_config(), {'query': 'wid:%s AND confirmed_entities_txt:*' % wid, 'fields': 'id,url,html_en,confirmed_entities_txt'})

for doc in qi:
    entity_tally = defaultdict(int)
    confirmed_entities = [entity.lower() for entity in doc.get('confirmed_entities_txt', [])]
    html = urllib2.urlopen(doc['url']).read()
    soup = BeautifulSoup(html)
    text = ' '.join([p.text for p in soup.find_all('p')])
    sentences = filter(not_infobox, [remove_newlines(sentence) for sentence in PunktSentenceTokenizer().tokenize(text)])
    for (i, sentence) in enumerate(sentences):
        lowercase = sentence.lower()
        for entity in confirmed_entities:
            if entity in lowercase:
                entity_tally[i] += 1
    if not entity_tally:
        summary = ' '.join(sentences[:TOP_N]).encode('utf-8')
Example 11
from time import time

start_time = time()

wid = sys.argv[1]
TOP_N = 5


def remove_newlines(text):
    if '\n' in text:
        return ' '.join([line for line in text.split('\n') if '.' in line])
    return text


qi = QueryIterator(
    get_config(), {
        'query': 'wid:%s AND confirmed_entities_txt:*' % wid,
        'fields': 'id,url,html_en,confirmed_entities_txt'
    })

for doc in qi:
    entity_tally = defaultdict(int)
    confirmed_entities = [
        entity.lower() for entity in doc.get('confirmed_entities_txt', [])
    ]
    html = urllib2.urlopen(doc['url']).read()
    soup = BeautifulSoup(html)
    text = ' '.join([p.text for p in soup.find_all('p')])
    sentences = filter(not_infobox, [
        remove_newlines(sentence)
        for sentence in PunktSentenceTokenizer().tokenize(text)
    ])
Example 12
import sys, re, json, requests
from operator import itemgetter
from WikiaSolr import QueryIterator, get_config

wid = sys.argv[1]

config = json.loads(''.join(open('worker-config.json').readlines()))
host = config['common']['solr_endpoint']

qi = QueryIterator(get_config(), {'query': 'wid:%s AND suspected_entities_txt:*' % wid, 'fields': 'id,suspected_entities_txt,confirmed_entities_txt'})
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields': 'id,suspected_entities_txt,confirmed_entities_txt', 'filterquery': 'views:[2000000 TO *]'})

suspected_entities = {}
confirmed_entities = {}

def hasalpha(string):
    """Return True if the string contains at least one lowercase ASCII letter."""
    import string as s
    if string:
        for letter in s.lowercase:
            if letter in string:
                return True
    return False

def normalize(string):
    """Lowercase, replace punctuation/underscores with spaces, and collapse whitespace,
    e.g. u"Luke Skywalker's X-Wing!" -> u'luke skywalker s x wing'."""
    string = re.sub(u'[^\w\s]|_', u' ', string.lower())
    string = re.sub(u' {2,}', u' ', string)
    return string.strip()

count = 0

for doc in qi:
Example 13
import sys, json, requests, re
import time #test
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from WikiaSolr import QueryIterator, get_config, as_string, expand_entities, WikiaDomainLoader
from WikiaSolr.StanfordParser import ParserService
import codecs

wid = sys.argv[1]

qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'fields':'id,url,html_en' })
#qi = QueryIterator(get_config(), {'query': 'id:3125_199499', 'fields':'id,url,html_en' })
#qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % wid, 'filterquery': 'views:[3500000 TO *]', 'fields':'id,url,html_en,views' }) #test

service = ParserService()

config = json.loads("".join(open('worker-config.json').readlines()))
host = config["common"]["solr_endpoint"]

entities = {}
confirmed_entities = {}

p = PunktSentenceTokenizer()

doc_count = 0

bullet1 = '\xe2\x80\xa2'.decode('utf-8')
bullet2 = '\xc2\xb7'.decode('utf-8')

start_time = time.time()