def handle_noargs(self, **options):
    articles_of_interest = ArticleOfInterest.objects.all()
    for article in articles_of_interest:
        article_dict = query_text_rendered(article.title,
                                           language=article.title_language)
        # don't import articles we already have
        if SourceArticle.objects.filter(doc_id__exact='%s' % article_dict['revid'],
                                        language=article.title_language):
            continue
        try:
            source_article = SourceArticle(title=article.title,
                                           language=article.title_language,
                                           source_text=article_dict['html'],
                                           timestamp=datetime.now(),
                                           doc_id=article_dict['revid'])
            source_article.save()
            tr = TranslationRequest(article=source_article,
                                    target_language=article.target_language,
                                    date=datetime.now(),
                                    translator=DEFAULT_TRANNY)
            tr.save()
        except Exception as e:
            print type(e)
            print e.args
            # roll back whatever was saved; either object may not exist yet
            try:
                source_article.delete()
                tr.delete()
            except:
                pass
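# Every snippet in this file leans on the dict returned by
# wikipydia.query_text_rendered. Judging only from the fields accessed here
# (not from wikipydia's documentation), it carries at least the rendered
# HTML and the revision id; a sketch of that assumed shape:
#
#   article_dict = query_text_rendered("Lisbon", language="en")
#   article_dict['html']   # rendered page HTML, stored as source_text
#   article_dict['revid']  # revision id, used as doc_id for deduplication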
def get_denominations(currencies):
    for currency in currencies:
        try:
            page = query_text_rendered(currency[1], "en")
            html = page['html']
            query = pq(html)
            ths = query(".infobox:first th")
            for th in ths.items():
                if "Banknotes" in th.text():
                    parent = th.parent()
                    td_text = parent("td:first").text()
                    if td_text != "":
                        denom_text = td_text
                    else:
                        next_row = parent.next()
                        denom_text = next_row("td").text()
                    tmp_denom = re.findall(r"\d+,?\d+", denom_text)
                    denominations = []
                    for denom in tmp_denom:
                        denom = denom.replace(',', "")
                        int_denom = int(denom)
                        if not int_denom:
                            # a zero chunk like '000' is spill-over from a second
                            # thousands separator (e.g. 1,000,000): glue it back on
                            denominations[-1] += denom
                        else:
                            denominations.append(denom)
                    if len(denominations):
                        currency.append(max([int(d) for d in denominations]))
                    else:
                        currency.append(None)
                    print currency
        except KeyError:
            pass
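# A quick illustration of what the denomination regex above extracts.
# r"\d+,?\d+" needs at least two digits, so a lone "5" is skipped, and a
# second thousands separator produces the zero chunk that the loop glues
# back onto the previous number:
#
#   >>> re.findall(r"\d+,?\d+", "5, 10, 20, 50, 100, 1,000 francs")
#   ['10', '20', '50', '100', '1,000']
#   >>> re.findall(r"\d+,?\d+", "1,000,000")
#   ['1,000', '000']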
def get_sentences(page_title):
    all_sents = []
    txt = wikipydia.query_text_rendered(page_title)
    parse = BeautifulSoup(txt['html'])
    justtext = parse.get_text()
    #justtext = justtext.encode('utf-8')
    tok = nltk.tokenize.PunktSentenceTokenizer()
    sents0 = tok.tokenize(justtext)
    chunker = TagChunker(treebank_chunker())
    for s0 in sents0:
        sents = s0.split('\n')
        for s in sents:
            verbfound = False
            nounfound = False
            ss = s.split()
            if len(ss) > 0:
                tree = chunker.parse(nltk.pos_tag(ss))
                # keep only strings that contain at least one verb tag...
                for tag in [p[1] for p in tree.leaves()]:
                    if tag[0] == 'V':
                        verbfound = True
                        break
                # ...and at least one NP chunk, i.e. plausibly a real sentence
                if verbfound:
                    for tag in [p[1] for p in tree.pos()]:
                        if tag == 'NP':
                            nounfound = True
                            break
                if verbfound and nounfound:
                    all_sents.append(remove_hlinks(s))
    return all_sents
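# remove_hlinks is called by both get_sentences variants but is not defined
# anywhere in these snippets. A minimal stand-in, assuming its job is to
# strip the bracketed citation/edit markers that BeautifulSoup's get_text()
# leaves in the text (e.g. "[1]", "[edit]", "[citation needed]"):

import re

def remove_hlinks(sentence):
    # drop every "[...]" marker along with the whitespace before it
    return re.sub(r'\s*\[[^\]]*\]', '', sentence).strip()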
def handle_noargs(self, **options):
    articles_of_interest = ArticleOfInterest.objects.all()
    for article in articles_of_interest:
        # don't import articles we already have
        if SourceArticle.objects.filter(title__exact='%s' % article.title,
                                        language=article.title_language):
            continue
        article_dict = query_text_rendered(article.title,
                                           language=article.title_language.code)
        try:
            source_article = SourceArticle(
                title=article.title,
                language=article.title_language,
                source_text=article_dict['html'],
                timestamp=datetime.now(),
                doc_id=article_dict['revid']
            )
            source_article.save()
        except Exception as e:
            print "Looks like we have an exception of type %s" % type(e)
            print "Exception args:", e.args
            try:
                source_article.delete()
            except:
                pass
def collect_wiki_corpus(language, lang, articles, splitters_folder):
    """
    Download the given wikipedia articles in language <lang> and write
    their plain text to <splitters_folder><language>.plain
    """
    filename = "%s%s.plain" % (splitters_folder, language)
    out = codecs.open(filename, "w", "utf-8")
    for title in articles:
        title = unquote(title)
        print ">> ", title
        try:
            article_dict = wikipydia.query_text_rendered(title, language=lang)
            logging.debug("Training on: %s" % (unquote(title)))
            # Soup it
            soup = BeautifulSoup(article_dict['html'])
            p_text = ''
            for p in soup.findAll('p'):
                only_p = p.findAll(text=True)
                p_text = ''.join(only_p)
                # Tokenize but keep . at the end of words
                p_tokenized = ' '.join(PunktWordTokenizer().tokenize(p_text))
                out.write(p_tokenized)
                out.write("\n")
        except KeyError:
            logging.error("tokenizer training error")
        #except:
        #    logging.error("some weird (JSON) error")
    out.close()
def get_currencies():
    currencies = []
    page = query_text_rendered("List_of_circulating_currencies", "en")
    html = page['html']
    query = pq(html)
    table = query("table:first")
    trs = table("tr")
    for tr in trs:
        tds = tr.findall("td")
        if len(tds):
            iso_code = tds[-3].text
            link = tds[-5].find("a")
            if link is not None and iso_code is not None:
                url = link.attrib['href'][6:]  # strip the leading "/wiki/"
                currency = [iso_code, url]
                if currency not in currencies:
                    currencies.append(currency)
    return currencies
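# How the two currency helpers appear to fit together: get_currencies
# scrapes [iso_code, article_url] pairs from the list of circulating
# currencies, and get_denominations appends the largest banknote value
# (or None) to each entry in place. A sketch of the assumed call sequence,
# with illustrative values:
#
#   currencies = get_currencies()   # e.g. [['USD', 'United_States_dollar'], ...]
#   get_denominations(currencies)   # mutates the entries in place
#   # each entry is now [iso_code, url, largest_banknote_or_None]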
def debug_html(path):
    qterms = get_query_terms(path)
    k = qterms.keys()[1]  # pick an arbitrary query term
    txt = wikipydia.query_text_rendered(qterms[k])
    parse = BeautifulSoup(txt['html'])
    justtext = parse.get_text()
    # (an earlier revision shelled out to html2text via temp files here)
    tok = nltk.tokenize.PunktSentenceTokenizer()
    sents = tok.tokenize(justtext)
    for s in range(0, min(100, len(sents))):
        print sents[s]
    return
def request_article(request, form_class=ArticleRequestForm,
                    template_name="wt_articles/request_form.html"):
    if request.method == "POST":
        article_request_form = form_class(request.POST)
        if article_request_form.is_valid():
            title = article_request_form.cleaned_data['title']
            title_language = article_request_form.cleaned_data['title_language']
            if not ArticleOfInterest.objects.filter(
                    title__exact=title,
                    title_language__exact=title_language):
                article_of_interest = article_request_form.save(commit=False)
                article_of_interest.date = datetime.now()
                article_of_interest.save()
                # fetched eagerly; the result is not used in this view
                article_dict = query_text_rendered(title,
                                                   language=title_language.code)
            request_form = form_class()
            return {"article_requested": True,
                    "request_form": request_form}
        else:
            # redisplay the bound form (with errors) instead of leaving
            # request_form unbound on an invalid POST
            request_form = article_request_form
    else:
        request_form = form_class()
    return {"article_requested": False,
            "request_form": request_form}
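# request_article returns plain dicts and never renders template_name itself,
# which suggests a template-rendering decorator is applied where the view is
# registered. A hypothetical wiring, assuming django-annoying's @render_to
# (the decorator is not shown in these snippets):
#
#   @render_to("wt_articles/request_form.html")
#   def request_article(request, ...):
#       ...
#       return {"article_requested": False, "request_form": request_form}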
def get_sentences(page_title):
    all_sents = []
    txt = wikipydia.query_text_rendered(page_title)
    parse = BeautifulSoup(txt['html'])
    justtext = parse.get_text()
    # (an earlier revision shelled out to html2text via temp files here)
    tok = nltk.tokenize.PunktSentenceTokenizer()
    sents = tok.tokenize(justtext)
    for s in sents:
        if s != "":
            all_sents.append(remove_hlinks(s))
    return all_sents
def collect_wiki_corpus(language, lang, num_items):
    """
    Download <num_items> random wikipedia articles in language <lang>
    """
    filename = "%s.plain" % (language)
    out = codecs.open(filename, "w", "utf-8")
    for title in query_random_titles(lang, num_items):
        article_dict = query_text_rendered(title, language=lang)
        # Soup it
        soup = BeautifulSoup(article_dict['html'])
        p_text = ''
        for p in soup.findAll('p'):
            only_p = p.findAll(text=True)
            p_text = ''.join(only_p)
            # Tokenize but keep . at the end of words
            p_tokenized = ' '.join(PunktWordTokenizer().tokenize(p_text))
            out.write(p_tokenized)
            out.write("\n")
    out.close()
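# A sketch of how the two collect_wiki_corpus variants would be driven; the
# language names, codes, and paths are placeholders:
#
#   collect_wiki_corpus("english", "en", 1000)  # this variant: n random articles
#   # the earlier variant takes an explicit title list and an output folder:
#   # collect_wiki_corpus("english", "en", titles, "splitters/")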
def request_translation(request, form_class=TranslationRequestForm,
                        template_name="wt_articles/request_form.html",
                        deletedId=-1, deleteAll=False, update=False):
    """
    deletedId in this context is the deleted article id
    """
    # Update
    if update:
        from wikipydia import query_text_rendered, query_text_raw
        from wt_articles import DEFAULT_TRANNY
        if request.POST:
            post = request.POST.copy()
            user_form = UserForm(post, instance=request.user)
            if user_form.is_valid():
                user_form.save()
                response = redirect('/accounts/' + request.user.username)
        else:
            articles_of_interest = ArticleOfInterest.objects.all()
            for article in articles_of_interest:
                if SourceArticle.objects.filter(title=article.title,
                                                language=article.title_language):
                    continue
                #article_dict = query_text_raw(article.title,
                #                              language=article.title_language)
                article_dict = query_text_rendered(article.title,
                                                   language=article.title_language)
                print(article.title, article.title_language)
                try:
                    source_article = SourceArticle(title=article.title,
                                                   language=article.title_language,
                                                   #source_text=article_dict['text'],
                                                   source_text=article_dict['html'],
                                                   timestamp=datetime.now(),
                                                   doc_id=article_dict['revid'])
                    source_article.save()
                    tr = TranslationRequest(article=source_article,
                                            target_language=article.target_language,
                                            date=datetime.now(),
                                            translator=DEFAULT_TRANNY)
                    tr.save()
                except Exception as e:
                    print type(e)
                    print e.args
                    try:
                        source_article.delete()
                        tr.delete()
                    except:
                        pass
    ### Delete
    if deletedId != -1:
        article = ArticleOfInterest.objects.filter(id=deletedId)
        article.delete()
    if deleteAll:
        ArticleOfInterest.objects.all().delete()
    if request.method == "POST":
        request_form = form_class(request.POST)
        if request_form.is_valid():
            title = request_form.cleaned_data['title']
            title_language = request_form.cleaned_data['title_language']
            target_language = request_form.cleaned_data['target_language']
            exists = ArticleOfInterest.objects.filter(
                title__exact=title,
                title_language__exact=title_language,
                target_language__exact=target_language)
            if len(exists) < 1:
                translation_request = request_form.save(commit=False)
                translation_request.date = datetime.now()
                translation_request.save()
            #return render_to_response("wt_articles/requests_thankyou.html", {},
            #                          context_instance=RequestContext(request))
    else:
        request_form = form_class()
    articles = all_articles_of_interest()
    return render_to_response(template_name, {
        "request_form": request_form,
        "articles": articles,
    }, context_instance=RequestContext(request))