Esempio n. 1
0
 def handle_noargs(self, **options):
     articles_of_interest = ArticleOfInterest.objects.all()
     for article in articles_of_interest:
         article_dict = query_text_rendered(article.title,
                                            language=article.title_language)
         # don't import articles we already have
         if SourceArticle.objects.filter(doc_id__exact='%s' % article_dict['revid'],
                                         language=article.title_language):
             continue
         try:
             source_article = SourceArticle(title=article.title,
                                            language=article.title_language,
                                            source_text=article_dict['html'],
                                            timestamp=datetime.now(),
                                            doc_id=article_dict['revid'])
             source_article.save()
             tr = TranslationRequest(article=source_article,
                                      target_language=article.target_language,
                                      date=datetime.now(),
                                      translator=DEFAULT_TRANNY)
             tr.save()
         except Exception as e:
             print type(e)
             print e.args
             try:
                 source_article.delete()
                 tr.delete()
             except:
                 pass
Esempio n. 2
0
def get_denominations(currencies):
    """Append the largest banknote denomination to each currency entry.

    ``currencies`` is a list of mutable entries whose second item is an
    English Wikipedia page title.  For each entry, the page's first
    infobox is scanned for a "Banknotes" row and one item is appended to
    the entry: the maximum denomination found (as an int) or ``None``.
    Entries whose fetched page lacks the expected keys are skipped.
    """
    for currency in currencies:
        try:
            page = query_text_rendered(currency[1], "en")
            html = page['html']
            query = pq(html)
            # Header cells of the first infobox on the page.
            ths = query(".infobox:first th")
            for th in ths.items():
                if "Banknotes" in th.text():
                    parent = th.parent()
                    td_text = parent("td:first").text()

                    # Denominations sit either in the header's own row...
                    if td_text != "":
                        denom_text = td_text
                    else:
                        # ...or in the row directly below it.
                        next = parent.next()
                        denom_text = next("td").text()
                    # NOTE(review): this pattern needs at least two digits
                    # per match, so single-digit denominations (e.g. "5")
                    # appear to be missed -- confirm this is intended.
                    tmp_denom = re.findall("\d+,?\d+", denom_text)
                    denominations = []
                    for denom in tmp_denom:
                        denom = denom.replace(',', "")
                        int_denom = int(denom)
                        if not int_denom:
                            # An all-zero match looks like a thousands
                            # group spilled from the previous number
                            # (e.g. "1,000,000" -> "1,000" + "000,000");
                            # glue it back onto the prior denomination.
                            denominations[-1] += denom
                        else:
                            denominations.append(denom)
                    if len(denominations):
                        currency.append(max([int(d) for d in denominations]))
                    else:
                        currency.append(None)

            print currency
        except KeyError:
            pass
Esempio n. 3
0
def get_sentences(page_title):
	"""Return the well-formed sentences of a rendered Wikipedia page.

	Fetches the page, strips markup with BeautifulSoup, splits the text
	into sentences, and keeps only those containing both a verb POS tag
	and an NP chunk.  Hyperlink markup is removed from each kept
	sentence via remove_hlinks().
	"""
	all_sents = []
	txt = wikipydia.query_text_rendered(page_title)
	parse = BeautifulSoup(txt['html'])
	justtext = parse.get_text()
	tok = nltk.tokenize.PunktSentenceTokenizer()
	sents0 = tok.tokenize(justtext)
	chunker = TagChunker(treebank_chunker())
	for s0 in sents0:
		for s in s0.split('\n'):
			verbfound = False
			nounfound = False
			ss = s.split()
			if len(ss) > 0:
				tree = chunker.parse(nltk.pos_tag(ss))
				# Any POS tag starting with 'V' counts as a verb.
				for tag in [p[1] for p in tree.leaves()]:
					if tag[0] == 'V':
						verbfound = True
						break
				# Only bother looking for an NP chunk once a verb exists.
				if verbfound:
					for tag in [p[1] for p in tree.pos()]:
						if tag == 'NP':
							nounfound = True
							break
			if verbfound and nounfound:
				all_sents.append(remove_hlinks(s))
	return all_sents
Esempio n. 4
0
 def handle_noargs(self, **options):
     articles_of_interest = ArticleOfInterest.objects.all()
     for article in articles_of_interest:
         article_dict = query_text_rendered(article.title,
                                            language=article.title_language)
         # don't import articles we already have
         if SourceArticle.objects.filter(doc_id__exact='%s' %
                                         article_dict['revid'],
                                         language=article.title_language):
             continue
         try:
             source_article = SourceArticle(
                 title=article.title,
                 language=article.title_language,
                 source_text=article_dict['html'],
                 timestamp=datetime.now(),
                 doc_id=article_dict['revid'])
             source_article.save()
             tr = TranslationRequest(
                 article=source_article,
                 target_language=article.target_language,
                 date=datetime.now(),
                 translator=DEFAULT_TRANNY)
             tr.save()
         except Exception as e:
             print type(e)
             print e.args
             try:
                 source_article.delete()
                 tr.delete()
             except:
                 pass
 def handle_noargs(self, **options):
     """Import each ArticleOfInterest from Wikipedia as a SourceArticle.

     Articles whose title/language pair is already stored are skipped
     before fetching.  On any error during creation the partially-saved
     row is deleted, best effort.
     """
     articles_of_interest = ArticleOfInterest.objects.all()
     for article in articles_of_interest:
         # don't import articles we already have
         if SourceArticle.objects.filter(title__exact='%s' % article.title,
                                         language=article.title_language):
             continue
         article_dict = query_text_rendered(
             article.title,
             language=article.title_language.code)
         try:
             source_article = SourceArticle(
                 title=article.title,
                 language=article.title_language,
                 source_text=article_dict['html'],
                 timestamp=datetime.now(),
                 doc_id=article_dict['revid']
                 )
             source_article.save()
         except Exception as e:
             print "Looks like we have an exception of type %s" % type(e)
             print "Exception args:", e.args
             # Best-effort rollback; ignore failures (e.g. the row was
             # never actually saved).
             try:
                 source_article.delete()
             except:
                 pass
def collect_wiki_corpus(language, lang, articles, splitters_folder):
	"""
	Download <n> random wikipedia articles in language <lang>
	"""
	filename = "%s%s.plain" % (splitters_folder,language)
	out = codecs.open(filename, "w", "utf-8")

	for title in articles:
		title=unquote(title)
		print ">> ",title
		print unquote(title)
		try:
			article_dict = wikipydia.query_text_rendered(title, language=lang)
			logging.debug("Training on: %s" % (unquote(title)))
			# Soup it
			soup = BeautifulSoup(article_dict['html'])
			p_text = ''
			for p in soup.findAll('p'):
				only_p = p.findAll(text=True)
				p_text = ''.join(only_p)

				# Tokenize but keep . at the end of words
				p_tokenized = ' '.join(PunktWordTokenizer().tokenize(p_text))

				out.write(p_tokenized)
				out.write("\n")
		except KeyError:
			logging.error("tokenizer training error")
		#except:
			#logging.error("some weird (JSON) error")
	out.close()
Esempio n. 7
0
def get_curencies():
    """Scrape the circulating currencies from Wikipedia.

    Reads the first table of the "List_of_circulating_currencies" page
    and returns a list of unique [iso_code, url] pairs, where url is the
    linked article path with its first six characters stripped.
    """
    page = query_text_rendered("List_of_circulating_currencies", "en")
    doc = pq(page['html'])
    rows = doc("table:first")("tr")
    result = []
    for row in rows:
        cells = row.findall("td")
        if not len(cells):
            continue
        code = cells[-3].text
        anchor = cells[-5].find("a")
        if anchor is None or code is None:
            continue
        entry = [code, anchor.attrib['href'][6:]]
        # Keep each currency only once.
        if entry not in result:
            result.append(entry)
    return result
Esempio n. 8
0
def debug_html(path):
	qterms = get_query_terms(path)
	#tmp = open('cntrl.tmp', 'w')
	k = qterms.keys()[1]
	txt = wikipydia.query_text_rendered(qterms[k])
	parse = BeautifulSoup(txt['html'])
	justtext = parse.get_text()
	#os.system("python html2text.py < cntrl.tmp > html.tmp")
	#os.remove('cntrl.tmp')
	html = ""
	#for line in open("html.tmp").readlines():
	#	html += line
	#sents = html.split("\\n")
	tok = nltk.tokenize.PunktSentenceTokenizer()
	sents = tok.tokenize(justtext)
	i = 0
	for s in range(0, 100):
		print sents[s]
	return
Esempio n. 9
0
def request_article(request,
                    form_class=ArticleRequestForm,
                    template_name="wt_articles/request_form.html"):
    """Handle the article-request form.

    On GET, return a fresh form.  On a valid POST for a title/language
    pair not yet requested, save the ArticleOfInterest, fetch the
    rendered article, and report success.  On an invalid POST or a
    duplicate request, redisplay the bound form (the original fell off
    the end here and returned None).
    """
    if request.method == "POST":
        article_request_form = form_class(request.POST)
        if article_request_form.is_valid():
            title = article_request_form.cleaned_data['title']
            title_language = article_request_form. \
                             cleaned_data['title_language']
            if not ArticleOfInterest.objects.filter(
                title__exact=title, title_language__exact=title_language):
                article_of_interest = article_request_form.save(commit=False)
                article_of_interest.date = datetime.now()
                article_of_interest.save()
                # Fetch the rendered article now; the result itself is
                # unused here.
                article_dict = query_text_rendered(
                    title, language=title_language.code)
                request_form = form_class()
                return {"article_requested": True,
                        "request_form": request_form}
        # Invalid form or already-requested article: show the bound form
        # so every code path returns a context dict.
        return {"article_requested": False,
                "request_form": article_request_form}
    else:
        request_form = form_class()
        return {"article_requested": False,
                "request_form": request_form}
Esempio n. 10
0
def get_sentences(page_title):
	"""Fetch a rendered Wikipedia page and return its non-empty
	sentences with hyperlink markup removed via remove_hlinks()."""
	all_sents = []
	txt = wikipydia.query_text_rendered(page_title)
	parse = BeautifulSoup(txt['html'])
	justtext = parse.get_text()
	tok = nltk.tokenize.PunktSentenceTokenizer()
	for s in tok.tokenize(justtext):
		if s != "":
			all_sents.append(remove_hlinks(s))
	return all_sents
Esempio n. 11
0
def collect_wiki_corpus(language, lang, num_items):
    """
    Download <num_items> random wikipedia articles in language <lang>
    and write their tokenized paragraph text, one paragraph per line,
    to <language>.plain (UTF-8).
    """
    filename = "%s.plain" % (language)
    out = codecs.open(filename, "w", "utf-8")
    # try/finally guarantees the file is closed even if a fetch or
    # tokenization step raises partway through.
    try:
        for title in query_random_titles(lang, num_items):
            article_dict = query_text_rendered(title, language=lang)

            # Soup it
            soup = BeautifulSoup(article_dict['html'])
            for p in soup.findAll('p'):
                # Paragraph text with all markup stripped.
                p_text = ''.join(p.findAll(text=True))

                # Tokenize but keep . at the end of words
                p_tokenized = ' '.join(PunktWordTokenizer().tokenize(p_text))

                out.write(p_tokenized)
                out.write("\n")
    finally:
        out.close()
Esempio n. 12
0
def request_translation(request, form_class=TranslationRequestForm, template_name="wt_articles/request_form.html", deletedId= -1, deleteAll = False, update = False):
    """
    Render and process the translation-request form.

    deletedId in this context is the deleted article id; -1 (the
    default) means nothing is deleted.  deleteAll removes every
    ArticleOfInterest.  update triggers either saving the posted user
    form (POST) or a bulk import of every article of interest into
    SourceArticle rows with an attached TranslationRequest (non-POST).
    """
    #Update
    if(update):
        # Wiki helpers are imported lazily, only for the update path.
        from wikipydia import query_text_rendered, query_text_raw        
        from wt_articles import DEFAULT_TRANNY    
        if request.POST:
            post = request.POST.copy()
            user_form = UserForm(post, instance=request.user)
            if user_form.is_valid():
                user_form.save()
                # NOTE(review): `response` is assigned but never
                # returned; execution falls through to the form
                # handling at the bottom of this view.
                response = redirect('/accounts/' + request.user.username)
        else:
            articles_of_interest = ArticleOfInterest.objects.all()        
            for article in articles_of_interest:            
                # Skip articles that were already imported.
                if SourceArticle.objects.filter(title=article.title, language=article.title_language):
                    continue
                #article_dict = query_text_raw(article.title,
                #                                   language=article.title_language)                                                                        
                article_dict = query_text_rendered(article.title,
                                               language=article.title_language)
                print(article.title, article.title_language)                       
                try:
                    source_article = SourceArticle(title=article.title,
                                               language=article.title_language,
                                               #source_text=article_dict['text'],
                                               source_text=article_dict['html'],
                                               timestamp=datetime.now(),
                                               doc_id=article_dict['revid'])
                    source_article.save()
                    tr = TranslationRequest(article=source_article,
                                         target_language=article.target_language,
                                         date=datetime.now(),
                                         translator=DEFAULT_TRANNY)
                    tr.save()                                    
                except Exception as e:
                    print type(e)
                    print e.args
                    # Best-effort rollback of the partially saved rows.
                    try:
                        source_article.delete()
                        tr.delete()
                    except:
                        pass         
    ### Delete
    if(deletedId != -1):
        article = ArticleOfInterest.objects.filter(id=deletedId)
        article.delete()
    if(deleteAll):
       ArticleOfInterest.objects.all().delete()        
    if request.method == "POST":
        request_form = form_class(request.POST)
        if request_form.is_valid():
            title = request_form.cleaned_data['title']
            title_language = request_form.cleaned_data['title_language']
            target_language = request_form.cleaned_data['target_language']
            # Only store a new request if an identical one does not
            # already exist.
            exists = ArticleOfInterest.objects.filter(title__exact=title,
                                                      title_language__exact=title_language,
                                                      target_language__exact=target_language)
            if len(exists) < 1:
                translation_request = request_form.save(commit=False)
                translation_request.date = datetime.now()
                translation_request.save()
            #return render_to_response("wt_articles/requests_thankyou.html", {},
            #                          context_instance=RequestContext(request))
    else:
        request_form = form_class()
     
    articles = all_articles_of_interest()
    return render_to_response(template_name, {
        "request_form": request_form,
        "articles": articles,
    }, context_instance=RequestContext(request))