Code example #1
 def handle_noargs(self, **options):
     articles_of_interest = ArticleOfInterest.objects.all()
     for article in articles_of_interest:
         article_dict = query_text_raw(article.title,
                                       language=article.title_language)
         # don't import articles we already have
         if SourceArticle.objects.filter(doc_id__exact='%s' % article_dict['revid'],
                                         language=article.title_language):
             continue
         try:
             source_article = SourceArticle(title=article.title,
                                            language=article.title_language,
                                            source_text=article_dict['text'],
                                            timestamp=datetime.now(),
                                            doc_id=article_dict['revid'])
             source_article.save()
             tr = TranslationRequest(article=source_article,
                                      target_language=article.target_language,
                                      date=datetime.now(),
                                      translator=DEFAULT_TRANNY)
             tr.save()
         except Exception as e:
             print type(e)
             print e.args
             try:
                 source_article.delete()
                 tr.delete()
             except:
                 pass
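If the surrounding project runs on Django 1.6 or newer, the manual delete-on-failure cleanup above could instead lean on transaction.atomic, which rolls back both rows when either save fails. A minimal sketch, assuming hypothetical import paths for the project's models and constants:

from datetime import datetime

from django.db import transaction
from translations.models import SourceArticle, TranslationRequest  # hypothetical import path
from translations.settings import DEFAULT_TRANNY                   # hypothetical import path

def import_article(article, article_dict):
    # If either save() raises, the whole block is rolled back, so no
    # manual delete() cleanup is needed.
    with transaction.atomic():
        source_article = SourceArticle(title=article.title,
                                       language=article.title_language,
                                       source_text=article_dict['text'],
                                       timestamp=datetime.now(),
                                       doc_id=article_dict['revid'])
        source_article.save()
        TranslationRequest(article=source_article,
                           target_language=article.target_language,
                           date=datetime.now(),
                           translator=DEFAULT_TRANNY).save()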
Code example #2
File: wiki.py Project: abstract-generator/ag-shared
 def get_wiki(self, query, lang):
     try:
         title = wikipydia.opensearch(query, language=lang)[1][0].encode('utf8')
         text = wikipydia.query_text_raw(title, language=lang)
         return title, text['text'].encode('utf8')
     except IndexError:
         pass
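The helper chains two wikipydia calls: opensearch(query, language=...), whose second tuple element is the ranked list of matching titles, and query_text_raw(title, language=...), which returns a dict whose 'text' entry holds the raw wiki markup (and, as example #1 shows, a 'revid' entry). A standalone sketch of the same lookup; the function name is illustrative:

import wikipydia

def lookup_article(query, lang='en'):
    # opensearch's second element is the list of matching titles; take the top hit.
    hits = wikipydia.opensearch(query, language=lang)[1]
    if not hits:
        return None
    title = hits[0]
    # query_text_raw returns a dict; 'text' holds the raw wiki markup.
    return title, wikipydia.query_text_raw(title, language=lang)['text']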
Code example #3
def get_sentences_for_article(article, article_id, lang, sentence_filename, write_to_file=True):
    """
    Converts the article to wiki text, splits it into sentences,
    and appends the sentences to a file.
    """
    wikimarkup = wikipydia.query_text_raw(article, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    if write_to_file:
        sentences = write_lines_to_file(sentence_filename, sentences)
        tags = write_lines_to_file(sentence_filename + '.tags', tags)
        seg_ids = []
        for i in range(len(sentences)):
            seg_ids.append(article_id + '_' + str(i))
        seg_ids = write_lines_to_file(sentence_filename + '.seg_ids', seg_ids)
    return sentences
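A call to this helper might look like the following; the title, article id, and output path are purely illustrative. With write_to_file=True it writes three files: the sentences themselves, a .tags file, and a .seg_ids file.

# Illustrative call; the argument values are made up for this example.
sentences = get_sentences_for_article(article='Barack Obama',
                                       article_id='en_0001',
                                       lang='en',
                                       sentence_filename='obama.sentences',
                                       write_to_file=True)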
Code example #4
def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
        )

    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize


    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>',1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<',1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines),
                                                   sent_detector,False))
                    currentLines = []
                else:
                    currentLines.append(line)
            

    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language=='en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s,t in zip(*wiki2sentences(text,sent_detector,True)):
                    print t[:4],s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text,sent_detector,False)).encode('utf-8')
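The dump is opened above by shelling out to zcat; the standard-library gzip module yields the same line iterator without a subprocess. A small sketch, not taken from the original script:

import gzip

def open_dump(path):
    # Behaves like the zcat branch above, but in pure Python.
    if path.endswith('.gz'):
        return gzip.open(path)
    return open(path)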
Code example #5
def get_sentences_for_article(article,
                              article_id,
                              lang,
                              sentence_filename,
                              write_to_file=True):
    """
   Converts the article to text, splits it into sentences.  
   Appends the sentences to file
   """
    wikimarkup = wikipydia.query_text_raw(article, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup,
                                                     determine_splitter(lang),
                                                     True)
    if (write_to_file):
        sentences = write_lines_to_file(sentence_filename, sentences)
        tags = write_lines_to_file(sentence_filename + '.tags', tags)
        seg_ids = []
        for i in range(0, len(sentences)):
            id = article_id + '_' + str(i)
            seg_ids.append(id)
        seg_ids = write_lines_to_file(sentence_filename + '.seg_ids', seg_ids)
    return sentences
Code example #6
import datetime
import sys

import wpTextExtractor
from wikipydia import query_text_raw  # assumed import; the original import line is not shown
# determine_splitter and get_negative_controls come from elsewhere in the
# project; their modules are not shown in this excerpt.

# Parse the YYYY-MM-DD date string passed as the second command-line argument.
date = datetime.date(int(sys.argv[2][:4]), int(sys.argv[2][5:7]), int(sys.argv[2][8:10]))

negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))

# For each negative-control article: print its name and the label '-1',
# then its rstripped first sentence, then every sentence up to and
# including the one tagged 'LastSentence'.
for article in negatives:
    print article.replace("_", " "), '\t',
    print '-1', '\t',
    first_sentence = ''
    paragraph = ''
    text = query_text_raw(article,sys.argv[1])['text']
    sentences, tags = wpTextExtractor.wiki2sentences(text, determine_splitter(sys.argv[1]), True)
    for sent, tag in zip(sentences, tags):
        if first_sentence == '':
            first_sentence = '1'
            print sent.encode('utf-8').rstrip(), '\t',
        print sent.encode('utf-8'),
        if tag == "LastSentence":
            break

    print ""


#negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))
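The slice-based parse above assumes a YYYY-MM-DD argument; the same date can be obtained with strptime, which fails loudly on malformed input. A sketch, not part of the original script:

import datetime
import sys

# Equivalent to the manual slicing of sys.argv[2] above.
date = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d').date()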

Code example #7
def get_lang_links_context(lang_links, lang, max_items=settings["top_links"], num_context_sentences=settings["num_context_sentences"]):
#(articles, lang, lang_properties, num_context_sentences=settings["num_context_sentences"], max_articles=settings["top_articles"]):
	#build vocabulary based on list of articles and compile context sentences for each word
	"""
	Extracts all of the non-English vocabulary from each of the pages, and retains
	up to the specified number of context sentences. The vocab is normalized by
	lowercasing and stripping punctuation.
	"""
	logging.info("getting context for interlanguage links")
	
	#add all unicode punctuation categories for exclusion
	all_chars=(unichr(i) for i in xrange(0x10000))
	punct=''.join(c for c in all_chars if unicodedata.category(c)[0]=='P')
	#punct_to_exclude= set(string.punctuation + "1234567890")
	punct_to_exclude= set(punct + "1234567890")	

	links_with_context={}

	splitter=determine_splitter(lang)

	for i,en_article in enumerate(lang_links):
		logging.debug ("item # %s from %s, # of good links %s, # of links needed %s" % (i, en_article,len(links_with_context), max_items))

		if len(links_with_context) >= max_items:
			break

		
		article = lang_links[en_article]["translation"]
		
		if use_as_gold_standard_translation(en_article, article, lang):
			logging.debug("link accepted %s - %s" % (en_article,article))

			word = unicode(article, "UTF-8")
			try:
				wikimarkup = wikipydia.query_text_raw(article, lang)['text']
				sentences,tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)
				
				for j,sentence in enumerate(sentences):
					if re.search(word, sentence):
						if not word in links_with_context:
							links_with_context[word] = {"context":[],"translation":en_article}
						
						if len(links_with_context[word]["context"]) < num_context_sentences:
							links_with_context[word]["context"].append(sentence)
							links_with_context[word]["translation"] = en_article
						else:
							break
			except KeyError:
				#logging.debug( u'no page for %s %s' % (article, lang))
				print u'no page for ', article, lang
			except IOError:
				logging.debug( u'cannot reach %s %s' % (article, lang))
			except TypeError:
				#logging.debug( u'unicode object error for %s %s' % (article, lang))
				print 'unicode object error for', article, lang
			except UnicodeDecodeError:
				#logging.debug( u'unicode error for %s %s' % (article, lang))
				print u'unicode error ', article, lang
			except:
				#logging.debug( u'something weird happened for %s %s' % (article, lang))
				print u'something weird happened for ', article, lang
		else:
			logging.debug("link rejected %s - %s" % (en_article,article))
			

	return links_with_context
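Note that re.search(word, sentence) above treats the link text as a regular expression, so a title containing characters such as '(' or '+' can raise or mismatch. If a literal substring match is intended, re.escape guards against that; a sketch under that assumption:

import re

def contains_literal(word, sentence):
    # Match the link text literally, escaping any regex metacharacters.
    return re.search(re.escape(word), sentence) is not None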
Code example #8
def get_vocab(articles, lang, lang_properties, num_context_sentences=settings["num_context_sentences"], max_articles=settings["top_articles"]):
	#build vocabulary based on list of articles and compile context sentences for each word

	logging.info("generating vocabulary")
	#add all unicode punctuation categories for exclusion
	all_chars=(unichr(i) for i in xrange(0x10000))
	punct=''.join(c for c in all_chars if unicodedata.category(c)[0]=='P')
	#punct_to_exclude= set(string.punctuation + "1234567890")
	punct_to_exclude= set(punct + "1234567890")	

	vocab={}
	num_articles=0

	splitter=determine_splitter(lang)
	
	for i,article in enumerate(articles):
		try:
			wikimarkup = wikipydia.query_text_raw(articles[i], lang)['text']
			sentences,tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)

			for sentence in sentences:
				sent = ''.join(ch for ch in sentence if ch not in punct_to_exclude)
				sent = sent.lower()
				words = sent.split(' ')

				for word in words:
				# filter words that are obviously not foreign-language (plain English or gibberish/non-alpha)
				#if not word in en_vocab:

					if len(word)<settings["min_letters"]:
						break

					if not word in vocab:
						vocab[word] = {"frequency":1,"context":[]}
					else:
						vocab[word]["frequency"]=vocab[word]["frequency"]+1
					if len(vocab[word]["context"]) < num_context_sentences:
						vocab[word]["context"].append(sentence)

			num_articles = num_articles + 1
			if num_articles >= max_articles:
				break

		except KeyError:
			#logging.debug( u'no page for %s %s' % (article, lang))
			print u'no page for ', article, lang
		except IOError:
			#logging.debug( u'cannot reach %s %s' % (article, lang))
			print u'cannot reach ', article, lang
		except TypeError:
			#logging.debug( u'unicode object error for %s %s' % (article, lang))
			print u'unicode object error for ', article, lang
		except UnicodeDecodeError:
			#logging.debug( u'unicode error for %s %s' % (article, lang))
			print u'unicode error for ', article, lang
		except:
			#logging.debug( u'something weird happened for %s %s' % (article, lang))
			print u'something weird happened for ', article, lang

	logging.info("vocabulary size: %s" % (len(vocab)))
	return vocab
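The normalization inside the loop (strip every BMP punctuation character plus the digits, then lowercase) can be exercised on its own; this Python 2 sketch rebuilds the same exclusion set:

import unicodedata

# Same exclusion set as get_vocab(): all BMP code points whose Unicode
# category starts with 'P', plus the ASCII digits.
all_chars = (unichr(i) for i in xrange(0x10000))
punct = ''.join(c for c in all_chars if unicodedata.category(c)[0] == 'P')
punct_to_exclude = set(punct + "1234567890")

def normalize(sentence):
    # Drop punctuation and digits, then lowercase, as get_vocab() does.
    stripped = ''.join(ch for ch in sentence if ch not in punct_to_exclude)
    return stripped.lower()

print normalize(u"Don't panic!")   # prints: dont panic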