Example #1
    def save(self):
        segment_id = 0
        sentence_splitter = determine_splitter(self.language)
        # initial save for foreign-key based saves to work;
        # it must happen after the sentence splitter is loaded
        super(SourceArticle, self).save()
        # split the wiki markup into sentences and store each one,
        # marking the last sentence of every paragraph
        for sent, tag in zip(*wiki2sentences(self.source_text, sentence_splitter)):
            s = SourceSentence(article=self, text=sent, segment_id=segment_id)
            segment_id += 1
            if tag == 'LastSentence':
                s.end_of_paragraph = True
            s.save()

        self.sentences_processed = True
        super(SourceArticle, self).save()
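# Usage sketch: a minimal, stand-alone illustration of the split-and-tag
# pattern used in save() above.  The import path below is an assumption; in
# these examples wiki2sentences comes from wpTextExtractor, and the splitter
# is whatever determine_splitter(lang) returns (e.g. an NLTK punkt tokenizer).
from wpTextExtractor import wiki2sentences   # assumed import path

def print_tagged_sentences(markup, splitter):
    # wiki2sentences returns parallel lists of sentences and tags; a
    # 'LastSentence' tag marks the end of a paragraph
    for sent, tag in zip(*wiki2sentences(markup, splitter, True)):
        print sent.encode('utf-8')
        if tag == 'LastSentence':
            print ''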
def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
        )

    # build a punkt sentence tokenizer for the requested language
    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize


    # stream a (possibly gzipped) XML dump, buffering each <text> element
    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines),
                                                   sent_detector,False))
                    currentLines = []
                else:
                    currentLines.append(line)
            

    else:
        # otherwise fetch each title given on the command line via wikipydia
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s,t in zip(*wiki2sentences(text,sent_detector,True)):
                    print t[:4],s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')
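# The dump branch above buffers every line between a '<text ...>' tag and the
# closing '</text>' before handing the joined markup to wiki2sentences.  A
# self-contained sketch of that buffering (stub input, no dump file needed):
def collect_text_blocks(lines):
    # yield the raw wiki markup of each <text>...</text> element
    current = []
    for line in lines:
        line = line.strip()
        if line.startswith('<text'):
            current.append(line.split('>', 1)[1])
        elif current:
            if line.endswith('</text>'):
                current.append(line.rsplit('<', 1)[0])
                yield '\n'.join(current)
                current = []
            else:
                current.append(line)

# e.g. collect_text_blocks(['<text xml:space="preserve">Hi there.', 'Bye.</text>'])
# yields the single block 'Hi there.\nBye.'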
def get_sentences_for_article(article, article_id, lang, sentence_filename, write_to_file=True):
    """
    Converts the article's wiki markup to text, splits it into sentences, and
    writes the sentences (plus their tags and segment ids) to files.
    """
    wikimarkup = wikipydia.query_text_raw(article, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    if write_to_file:
        sentences = write_lines_to_file(sentence_filename, sentences)
        tags = write_lines_to_file(sentence_filename + '.tags', tags)
        seg_ids = ['%s_%d' % (article_id, i) for i in range(len(sentences))]
        write_lines_to_file(sentence_filename + '.seg_ids', seg_ids)
    return sentences
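# Usage sketch for get_sentences_for_article (hypothetical title, id and
# filename; the call needs network access to the Wikipedia API via wikipydia):
def _demo_get_sentences():
    sentences = get_sentences_for_article('Barack Obama', 'en_0001', 'en',
                                          'obama.sentences')
    # writes obama.sentences, obama.sentences.tags and obama.sentences.seg_ids
    return sentences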
Example #5
import datetime
import sys

import wikipydia
import wpTextExtractor
# determine_splitter is a project-local helper; its import is not shown here

def write_lines_to_file(output_filename, lines):
	"""
	Writes a list of lines to a file, UTF-8 encoded, one line per entry.
	"""
	output_file = open(output_filename, 'w')
	for line in lines:
		output_file.write(line.encode('UTF-8'))
		output_file.write('\n'.encode('UTF-8'))
	output_file.close()
	return lines
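# Usage sketch for write_lines_to_file (hypothetical filename): it expects
# unicode lines, writes them UTF-8 encoded one per line, and returns the list
# unchanged so calls can be chained as in get_sentences_for_article above.
def _demo_write_lines():
    return write_lines_to_file('demo.txt', [u'hello', u'caf\xe9'])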

#topics = read_lines_from_file('/Users/bahn/work/wikitopics/data/clustering/pick/pick0127')
date = datetime.date(2009, 10, 12)
lang = 'en'

# quick check of how wiki2sentences handles markup that contains only HTML comments
sentences, tags = wpTextExtractor.wiki2sentences("<!-- See  -->\n<!-- PLEASE DO NOT CHANGE OBAMA'S NAME -->", determine_splitter(lang), True)
for s in sentences:
	print s
sys.exit(0)

#topics = ['Inauguration_of_Barack_Obama', 'Bill_Clinton', 'Black_Saturday_bushfires', 'Estradiol','Emma_Frost','Influenza','James','Brett_Favre']
topics = ['Barack_Obama']
shown = {}
shown2 = {}
shown3 = {}
for article in topics:
	revid = wikipydia.query_revid_by_date(article, lang, date)
	print revid
	wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
	sentences,tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
	wikimarkup = '\n'.join(sentences)
import datetime
import sys

import wpTextExtractor
from wikipydia import query_text_raw
# determine_splitter and get_negative_controls are project-local helpers;
# their imports are not shown in this snippet

# sys.argv[1] is the language code, sys.argv[2] an ISO date (YYYY-MM-DD), and
# sys.argv[3] an integer count passed to get_negative_controls
date = datetime.date(int(sys.argv[2][:4]), int(sys.argv[2][5:7]), int(sys.argv[2][8:10]))

negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))

for article in negatives:
    print article.replace("_", " "), '\t', 
    print '-1', '\t',
    first_sentence = ''
    text = query_text_raw(article, sys.argv[1])['text']
    sentences, tags = wpTextExtractor.wiki2sentences(text, determine_splitter(sys.argv[1]), True)
    # print the first sentence as its own tab-separated field, then the whole
    # first paragraph, stopping at the paragraph boundary
    for sent, tag in zip(sentences, tags):
        if first_sentence == '':
            first_sentence = '1'
            print sent.encode('utf-8').rstrip(), '\t',
        print sent.encode('utf-8'),
        if tag == "LastSentence":
            break

    print ""


def get_lang_links_context(lang_links, lang, max_items=settings["top_links"], num_context_sentences=settings["num_context_sentences"]):
	"""
	Extracts all of the non-English vocabulary from each of the pages, and retains
	up to the specified number of context sentences.  The vocab is normalized by
	lowercasing and stripping punctuation.
	"""
	logging.info("getting context for interlanguage links")
	
	#add all unicode punctuation categories for exclusion
	all_chars=(unichr(i) for i in xrange(0x10000))
	punct=''.join(c for c in all_chars if unicodedata.category(c)[0]=='P')
	#punct_to_exclude= set(string.punctuation + "1234567890")
	punct_to_exclude= set(punct + "1234567890")	

	links_with_context={}

	splitter=determine_splitter(lang)

	for i,en_article in enumerate(lang_links):
		logging.debug ("item # %s from %s, # of good links %s, # of links needed %s" % (i, en_article,len(links_with_context), max_items))

		if len(links_with_context) >= max_items:
			break

		
		article = lang_links[en_article]["translation"]
		
		if use_as_gold_standard_translation(en_article, article, lang):
			logging.debug("link accepted %s - %s" % (en_article,article))

			word = unicode(article, "UTF-8")
			try:
				wikimarkup = wikipydia.query_text_raw(article, lang)['text']
				sentences,tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)
				
				for j,sentence in enumerate(sentences):
					# re.escape makes regex metacharacters in the title match literally
					if re.search(re.escape(word), sentence):
						if not word in links_with_context:
							links_with_context[word] = {"context":[],"translation":en_article}
						
						if len(links_with_context[word]["context"]) < num_context_sentences:
							links_with_context[word]["context"].append(sentence)
							links_with_context[word]["translation"] = en_article
						else:
							break
			except KeyError:
				#logging.debug( u'no page for %s %s' % (article, lang))
				print u'no page for ', article, lang
			except IOError:
				logging.debug( u'cannot reach %s %s' % (article, lang))
			except TypeError:
				#logging.debug( u'unicode object error for %s %s' % (article, lang))
				print 'unicode object error for', article, lang
			except UnicodeDecodeError:
				#logging.debug( u'unicode error for %s %s' % (article, lang))
				print u'unicode error ', article, lang
			except:
				#logging.debug( u'something weird happened for %s %s' % (article, lang))
				print u'something weird happened for ', article, lang
		else:
			logging.debug("link rejected %s - %s" % (en_article,article))
			

	return links_with_context
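# Usage sketch for get_lang_links_context.  The lang_links value below is
# hypothetical but follows the shape the function reads (a "translation" entry
# per English title); real calls fetch pages over the network via wikipydia.
def _demo_links_context():
    links = {'Cat': {'translation': 'Chat'}}
    # returns {foreign_word: {"context": [...], "translation": english_title}}
    return get_lang_links_context(links, 'fr', max_items=1, num_context_sentences=2)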
def get_vocab(articles, lang, lang_properties, num_context_sentences=settings["num_context_sentences"], max_articles=settings["top_articles"]):
	#build vocabulary based on list of articles and compile context sentences for each word

	logging.info("generating vocabulary")
	#add all unicode punctuation categories for exclusion
	all_chars=(unichr(i) for i in xrange(0x10000))
	punct=''.join(c for c in all_chars if unicodedata.category(c)[0]=='P')
	#punct_to_exclude= set(string.punctuation + "1234567890")
	punct_to_exclude= set(punct + "1234567890")	

	vocab={}
	num_articles=0

	splitter=determine_splitter(lang)
	
	for i,article in enumerate(articles):
		try:
			wikimarkup = wikipydia.query_text_raw(articles[i], lang)['text']
			sentences,tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)

			for sentence in sentences:
				sent = ''.join(ch for ch in sentence if ch not in punct_to_exclude)
				sent = sent.lower()
				words = sent.split(' ')

				for word in words:
				# filter words that are obviously non-foreign-language (plain English or gibberish/non-alpha)
				#if not word in en_vocab:

					if len(word)<settings["min_letters"]:
						break

					if not word in vocab:
						vocab[word] = {"frequency":1,"context":[]}
					else:
						vocab[word]["frequency"]=vocab[word]["frequency"]+1
					if len(vocab[word]["context"]) < num_context_sentences:
						vocab[word]["context"].append(sentence)

			num_articles = num_articles + 1
			if num_articles >= max_articles:
				break

		except KeyError:
			#logging.debug( u'no page for %s %s' % (article, lang))
			print u'no page for ', article, lang
		except IOError:
			#logging.debug( u'cannot reach %s %s' % (article, lang))
			print u'cannot reach ', article, lang
		except TypeError:
			#logging.debug( u'unicode object error for %s %s' % (article, lang))
			print u'unicode object error for ', article, lang
		except UnicodeDecodeError:
			#logging.debug( u'unicode error for %s %s' % (article, lang))
			print u'unicode error for ', article, lang
		except:
			#logging.debug( u'something weird happened for %s %s' % (article, lang))
			print u'something weird happened for ', article, lang

	logging.info("vocabulary size: %s" % (len(vocab)))
	return vocab
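# Usage sketch for get_vocab.  The article titles are hypothetical, and the
# lang_properties argument is not used in the snippet above, so None is
# passed; the call fetches each article over the network via wikipydia.
def _demo_vocab():
    vocab = get_vocab(['Chat', 'Chien'], 'fr', None, num_context_sentences=2, max_articles=2)
    # vocab maps each lowercased token to {"frequency": n, "context": [sentences]}
    return vocab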
Example #9
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
	if os.path.exists(output_dir):
		if not os.path.isdir(output_dir):
			sys.stderr.write(output_dir + " is not a directory\n")
			sys.exit(1)
	else:
		os.makedirs(output_dir)

	success = 0
	articles = {}
	mark = {}
	for article, values in topics.items():
		if success >= upperlimit:
			break
		title = article

		# resolve redirects
		if not wikipydia.query_exists(title, lang):
			continue
		title = wikipydia.query_redirects(title, lang).replace(' ','_')

		if title in mark:
			continue
		mark[title] = True

		# the file prefix for output files
		file_prefix = urllib.quote(title.replace(' ','_').encode('utf8'), safe="%") # force / to be quoted and % not to be quoted
		if file_prefix.startswith('.'):
			file_prefix = "%2E" + file_prefix[1:]

		if dryrun:
			print file_prefix
			success += 1
			continue

		# make sure these are defined even if every retry attempt fails
		wikimarkup = None
		revid = 0
		done = False
		no_retry = 0
		while not done and no_retry < retry:
			try:
				revid = values['thenid']
				if revid == 0:
					revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
				wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
				done = True
			except:
				no_retry += 1
				time.sleep(wait)

		if not wikimarkup:
			print 'Retrieving', title, 'failed'
			print 'RevID:', revid
			print 'Date:', date.isoformat()
			continue
		try:
			sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
		except:
			sys.stdout.flush()
			sys.stdout.write('Failed retrieving the text from ' + title + '\n')
			traceback.print_exc()
			sys.stdout.flush()
			continue

		# substitute angle brackets with html-like character encodings
		#sentences = [re.sub('<', '&lt;', re.sub('>', '&gt;', s)) for s in sentences]
		#sentences.insert(0, urllib.unquote(file_prefix.replace('_',' ')) + '.')
		output_filename = os.path.join(output_dir, file_prefix + '.sentences')
		output = write_lines_to_file(output_filename, sentences)
		output_filename = os.path.join(output_dir, file_prefix + '.tags')
		output = write_lines_to_file(output_filename, tags)
		success += 1

		priorid = values['priorid']
		if priorid == 0:
			priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
		articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
		sys.stderr.write('.')
	sys.stderr.write('\n')

	if not dryrun:
		if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
			write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))
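# Usage sketch for fetch_articles_on_date.  The topics dict mirrors the shape
# the function reads ('score', 'thenid', 'priorid'); the title, score and
# output directory are hypothetical.  With dryrun=True the function only
# resolves redirects and prints the file prefixes it would write.
def _demo_fetch():
    topics = {'Barack_Obama': {'score': 1.0, 'thenid': 0, 'priorid': 0}}
    fetch_articles_on_date(topics, datetime.date(2009, 10, 12), 'en', 'fetched', upperlimit=1, dryrun=True)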