def handle_noargs(self, **options):
    articles_of_interest = ArticleOfInterest.objects.all()
    for article in articles_of_interest:
        article_dict = query_text_raw(article.title, language=article.title_language)
        # don't import articles we already have
        if SourceArticle.objects.filter(doc_id__exact='%s' % article_dict['revid'],
                                        language=article.title_language):
            continue
        source_article = None
        tr = None
        try:
            source_article = SourceArticle(title=article.title,
                                           language=article.title_language,
                                           source_text=article_dict['text'],
                                           timestamp=datetime.now(),
                                           doc_id=article_dict['revid'])
            source_article.save()
            tr = TranslationRequest(article=source_article,
                                    target_language=article.target_language,
                                    date=datetime.now(),
                                    translator=DEFAULT_TRANNY)
            tr.save()
        except Exception as e:
            print type(e)
            print e.args
            # roll back whatever was saved before the failure
            try:
                if source_article is not None:
                    source_article.delete()
                if tr is not None:
                    tr.delete()
            except:
                pass
def get_wiki(self, query, lang):
    """
    Resolves a search query to the best-matching article title and returns
    the title together with the article's raw wiki text.
    """
    try:
        title = wikipydia.opensearch(query, language=lang)[1][0].encode('utf8')
        text = wikipydia.query_text_raw(title, language=lang)
        return title, text['text'].encode('utf8')
    except IndexError:
        # no search results for this query
        return None
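# Minimal sketch (not part of the original code) of the two wikipydia calls
# that get_wiki wraps: opensearch to resolve a free-text query to a title,
# then query_text_raw to fetch the markup for that title.  The query string
# and language code below are illustrative assumptions.
import wikipydia

def demo_lookup(query='Machine translation', lang='en'):
    hits = wikipydia.opensearch(query, language=lang)
    if len(hits) > 1 and hits[1]:
        title = hits[1][0]
        text = wikipydia.query_text_raw(title, language=lang)
        return title, text['text']
    return None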
def get_sentences_for_article(article, article_id, lang, sentence_filename, write_to_file=True):
    """
    Converts the article to text, splits it into sentences, and appends
    the sentences to file.
    """
    wikimarkup = wikipydia.query_text_raw(article, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    if write_to_file:
        sentences = write_lines_to_file(sentence_filename, sentences)
        tags = write_lines_to_file(sentence_filename + '.tags', tags)
        seg_ids = []
        for i in range(len(sentences)):
            seg_ids.append(article_id + '_' + str(i))
        seg_ids = write_lines_to_file(sentence_filename + '.seg_ids', seg_ids)
    return sentences
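# Hypothetical invocation of get_sentences_for_article (not from the original
# code): the article title, segment-id prefix, and output filename are made-up
# values.  With write_to_file=True the function is expected to write
# <filename>, <filename>.tags and <filename>.seg_ids as side effects and to
# return the sentence list.
sentences = get_sentences_for_article('Machine translation', 'en_0001', 'en',
                                      '/tmp/en_0001.sentences',
                                      write_to_file=True)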
def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
    )
    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize
    if options.fromDump:
        # read wiki markup from an XML dump instead of the live API
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>', 1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<', 1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines), sent_detector, False))
                    currentLines = []
                else:
                    currentLines.append(line)
    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language == 'en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s, t in zip(*wiki2sentences(text, sent_detector, True)):
                    print t[:4], s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text, sent_detector, False)).encode('utf-8')
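# Sketch (not from the original code) of the two wiki2sentences calling modes
# used in main above.  Assumptions: this sits in the same module that defines
# wiki2sentences, and the third argument toggles whether per-sentence tags
# such as 'LastSentence' are returned alongside the text, as the zip(*...)
# usage and the tag checks elsewhere in this project suggest.
def demo_wiki2sentences(markup, sent_detector):
    plain = wiki2sentences(markup, sent_detector, False)            # sentences only
    sentences, tags = wiki2sentences(markup, sent_detector, True)   # sentences plus tags
    for s, t in zip(sentences, tags):
        print t[:4], s.encode('utf-8')
    return plain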
import datetime
import sys

import wpTextExtractor
from wikipydia import query_text_raw
# get_negative_controls and determine_splitter are defined elsewhere in this project

# usage: <script> <lang> <YYYY-MM-DD> <count>
date = datetime.date(int(sys.argv[2][:4]), int(sys.argv[2][5:7]), int(sys.argv[2][8:10]))
negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))

for article in negatives:
    print article.replace("_", " "), '\t',
    print '-1', '\t',
    first_sentence = ''
    paragraph = ''
    text = query_text_raw(article, sys.argv[1])['text']
    sentences, tags = wpTextExtractor.wiki2sentences(text, determine_splitter(sys.argv[1]), True)
    for sent, tag in zip(sentences, tags):
        if first_sentence == '':
            first_sentence = '1'
            print sent.encode('utf-8').rstrip(), '\t',
        print sent.encode('utf-8'),
        if tag == "LastSentence":
            break
    print ""
#negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))
def get_lang_links_context(lang_links, lang, max_items=settings["top_links"],
                           num_context_sentences=settings["num_context_sentences"]):
    """
    Extracts all of the non-English vocabulary from each of the pages, and
    retains up to the specified number of context sentences. The vocab is
    normalized by lowercasing and stripping punctuation.
    """
    logging.info("getting context for interlanguage links")
    # add all unicode punctuation categories for exclusion
    all_chars = (unichr(i) for i in xrange(0x10000))
    punct = ''.join(c for c in all_chars if unicodedata.category(c)[0] == 'P')
    punct_to_exclude = set(punct + "1234567890")
    links_with_context = {}
    splitter = determine_splitter(lang)
    for i, en_article in enumerate(lang_links):
        logging.debug("item # %s from %s, # of good links %s, # of links needed %s" %
                      (i, en_article, len(links_with_context), max_items))
        if len(links_with_context) >= max_items:
            break
        article = lang_links[en_article]["translation"]
        if use_as_gold_standard_translation(en_article, article, lang):
            logging.debug("link accepted %s - %s" % (en_article, article))
            word = unicode(article, "UTF-8")
            try:
                wikimarkup = wikipydia.query_text_raw(article, lang)['text']
                sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)
                for j, sentence in enumerate(sentences):
                    if re.search(word, sentence):
                        if not word in links_with_context:
                            links_with_context[word] = {"context": [], "translation": en_article}
                        if len(links_with_context[word]["context"]) < num_context_sentences:
                            links_with_context[word]["context"].append(sentence)
                            links_with_context[word]["translation"] = en_article
                        else:
                            break
            except KeyError:
                print u'no page for ', article, lang
            except IOError:
                logging.debug(u'cannot reach %s %s' % (article, lang))
            except TypeError:
                print u'unicode object error for', article, lang
            except UnicodeDecodeError:
                print u'unicode error ', article, lang
            except:
                print u'something weird happened for ', article, lang
        else:
            logging.debug("link rejected %s - %s" % (en_article, article))
    return links_with_context
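# Illustrative input/output shapes for get_lang_links_context, inferred from
# the lookups above rather than taken from the original code: keys of
# lang_links are English titles, each value carries the foreign-language title
# under "translation", and the result maps each accepted foreign title to its
# context sentences.  The titles and language code below are made up.
sample_lang_links = {
    'Machine translation': {'translation': 'Traduction automatique'},
}
links = get_lang_links_context(sample_lang_links, 'fr', max_items=1,
                               num_context_sentences=2)
# links might look like:
# {u'Traduction automatique': {'translation': 'Machine translation',
#                              'context': [u'...', u'...']}}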
def get_vocab(articles, lang, lang_properties,
              num_context_sentences=settings["num_context_sentences"],
              max_articles=settings["top_articles"]):
    """
    Builds a vocabulary from the list of articles and compiles context
    sentences for each word.
    """
    logging.info("generating vocabulary")
    # add all unicode punctuation categories for exclusion
    all_chars = (unichr(i) for i in xrange(0x10000))
    punct = ''.join(c for c in all_chars if unicodedata.category(c)[0] == 'P')
    punct_to_exclude = set(punct + "1234567890")
    vocab = {}
    num_articles = 0
    splitter = determine_splitter(lang)
    for i, article in enumerate(articles):
        try:
            wikimarkup = wikipydia.query_text_raw(article, lang)['text']
            sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)
            for sentence in sentences:
                sent = ''.join(ch for ch in sentence if ch not in punct_to_exclude)
                sent = sent.lower()
                words = sent.split(' ')
                for word in words:
                    # filter words that are obviously not foreign-language content
                    # (plain English or gibberish / non-alpha)
                    if len(word) < settings["min_letters"]:
                        break
                    if not word in vocab:
                        vocab[word] = {"frequency": 1, "context": []}
                    else:
                        vocab[word]["frequency"] = vocab[word]["frequency"] + 1
                    if len(vocab[word]["context"]) < num_context_sentences:
                        vocab[word]["context"].append(sentence)
            num_articles = num_articles + 1
            if num_articles >= max_articles:
                break
        except KeyError:
            print u'no page for ', article, lang
        except IOError:
            print u'cannot reach ', article, lang
        except TypeError:
            print u'unicode object error for ', article, lang
        except UnicodeDecodeError:
            print u'unicode error for ', article, lang
        except:
            print u'something weird happened for ', article, lang
    logging.info("vocabulary size: %s" % (len(vocab)))
    return vocab
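# Hypothetical call to get_vocab (illustrative only): the article titles and
# language code are made up, and lang_properties is not used by the body
# above, so an empty dict is passed here.
vocab = get_vocab(['Traduction automatique', 'Apprentissage automatique'],
                  'fr', {}, num_context_sentences=3, max_articles=2)
for word in sorted(vocab, key=lambda w: -vocab[w]["frequency"])[:10]:
    print word.encode('utf-8'), vocab[word]["frequency"]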