def process_upload(doc):
    """Save an uploaded document to disk, extract its text, and clean up.

    Args:
        doc: uploaded file object accepted by ``filehandler.open_doc``.

    Returns:
        The extracted text as produced by ``filehandler.convert_to_txt``.
    """
    file_path = filehandler.open_doc(doc)
    # stat() the saved file ourselves because the browser might not have
    # sent content_length
    file_size = os.stat(file_path).st_size
    logger.debug("Upload: %d bytes", file_size)
    try:
        words = filehandler.convert_to_txt(file_path)
    finally:
        # BUGFIX: always remove the temp file, even when conversion raises,
        # so failed uploads don't leak files on disk
        filehandler.delete_file(file_path)
    return words
def test_too_many_counts(self):
    """Word counts from a large fixture must be capped at wordhandler.MAX_ITEMS."""
    path = os.path.join(self._fixtures_dir, '22kAmazonGameReview.txt')
    text = filehandler.convert_to_txt(path)
    results = wordhandler.get_word_counts(text, True, True, 'english')
    # the first three result buckets are each truncated to MAX_ITEMS entries
    for idx in range(3):
        self.assertEqual(len(results[idx]), wordhandler.MAX_ITEMS)
def test_too_many_counts(self):
    """A 22k-word fixture should produce count lists truncated to MAX_ITEMS."""
    fixture = os.path.join(self._fixtures_dir, '22kAmazonGameReview.txt')
    extracted = filehandler.convert_to_txt(fixture)
    buckets = wordhandler.get_word_counts(extracted, True, True, 'english')
    limit = wordhandler.MAX_ITEMS
    self.assertEqual(len(buckets[0]), limit)
    self.assertEqual(len(buckets[1]), limit)
    self.assertEqual(len(buckets[2]), limit)
def process_results(file_paths, titles, sample_id, source):
    """Run the same/different comparison over the uploaded files and redirect.

    Args:
        file_paths: local paths of the files to compare.
        titles: display titles for the documents.
        sample_id: identifier when the input came from a built-in sample.
        source: label for where the input came from.

    Returns:
        A Flask redirect to the saved results page.
    """
    file_names = filehandler.get_file_names(file_paths)
    # stat() each file ourselves because the browser might not have sent
    # content_length
    sizes = [str(os.stat(p).st_size) for p in file_paths]
    logger.debug("Upload: %s bytes", ", ".join(sizes))
    docs = [filehandler.convert_to_txt(p) for p in file_paths]
    data = textanalysis.common_and_unique_word_freqs(docs)
    job_id = mongo.save_samediff(
        'samediff', file_names,
        data['doc1total'], data['doc2total'],
        data['doc1unique'], data['doc2unique'],
        data['common'], data['common_counts'],
        data['doc1'], data['doc2'],
        data['cosine_similarity'],
        titles, sample_id, source)
    return redirect(request.url + 'results/' + job_id + '?submit=true')
def test_convert_to_txt_latin1(self):
    """A Latin-1 encoded fixture converts to exactly 860 characters of text."""
    path = os.path.join(self._fixtures_dir, 'latin-1.txt')
    self.assertEqual(len(filehandler.convert_to_txt(path)), 860)
def test_convert_to_txt_utf8(self):
    """A UTF-8 encoded fixture converts to exactly 7159 characters of text."""
    path = os.path.join(self._fixtures_dir, 'utf-8.txt')
    self.assertEqual(len(filehandler.convert_to_txt(path)), 7159)
def index():
    """Word-counter landing page: render the forms, or run a submitted job.

    On POST, extracts text from whichever form was submitted (paste, upload,
    sample, or link), counts the words, stores the result, and redirects to
    the results page.
    """
    words = None
    forms = OrderedDict()
    forms['sample'] = WordCounterSample(g.current_lang)
    forms['paste'] = WordCounterPaste('I am Sam\nSam I am\nThat Sam-I-am!\nThat Sam-I-am!\nI do not like that Sam-I-am!\nDo you like \ngreen eggs and ham?\nI do not like them, Sam-I-am.\nI do not like\ngreen eggs and ham.\nWould you like them \nhere or there?\nI would not like them\nhere or there.\nI would not like them anywhere.')
    forms['upload'] = WordCounterUpload()
    forms['link'] = WordCounterLink()
    if request.method == 'POST':
        ignore_case = True
        ignore_stopwords = True
        btn_value = request.form['btn']
        sample_id = ''
        if btn_value == 'paste':
            words = forms['paste'].data['area']
            ignore_case = forms[btn_value].data['ignore_case_paste']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_paste']
            logger.debug("New from paste: %d chars", len(words))
            title = _('your text')
        elif btn_value == 'upload':
            upload_file = forms['upload'].data['upload']
            words = process_upload(upload_file)
            ignore_case = forms[btn_value].data['ignore_case_upload']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_upload']
            title = upload_file.filename
            logger.debug("New from upload: %s", title)
        elif btn_value == 'sample':
            sample_source = forms['sample'].data['sample']
            samplename = filehandler.get_sample_title(sample_source)
            title = samplename
            ignore_case = forms[btn_value].data['ignore_case_sample']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_sample']
            # samples are cached in mongo keyed by title + option flags, so a
            # repeat request can be served by redirecting to the stored doc
            sample_id = title + str(ignore_case) + str(ignore_stopwords)
            existing_doc_id = mongo.results_for_sample('wordcounter', sample_id)
            if existing_doc_id is not None:
                logger.debug("Existing from sample: %s", sample_source)
                return redirect(request.url + 'results/' + existing_doc_id)
            logger.info("New from sample: %s", sample_source)
            sample_path = filehandler.get_sample_path(sample_source)
            logger.debug(" loading from %s", sample_path)
            words = filehandler.convert_to_txt(sample_path)
        elif btn_value == 'link':
            url = forms['link'].data['link']
            # TODO: should actually accept https
            if 'https://' in url:
                url = url.replace('https', 'http')
            elif 'http://' not in url:
                url = 'http://' + url
            logger.debug("New from link: %s", url)
            content = filehandler.download_webpage(url)
            words = content['text']
            ignore_case = forms[btn_value].data['ignore_case_link']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_link']
            title = _(content['title'])
        if words is not None:
            logger.debug(" about to process words")
            counts = process_words(words, ignore_case, ignore_stopwords, btn_value == 'sample')
            logger.debug(" finished counts, about to save")
            doc_id = mongo.save_words('wordcounter', counts, ignore_case, ignore_stopwords, title, sample_id, btn_value)
            logger.debug(" saved")
            return redirect(request.url + 'results/' + doc_id + '?submit=true')
    return render_template('wordcounter.html', forms=forms.items(), tool_name='wordcounter', max_file_size_in_mb=g.max_file_size_mb)
def index():
    """Word-counter landing page: render the forms, or run a submitted job.

    On POST, extracts text from whichever form was submitted (paste, upload,
    sample, or link), counts the words, stores the result (together with any
    sample metadata in ``extras_to_save``), and redirects to the results page.
    """
    words = None
    forms = OrderedDict()
    forms['sample'] = WordCounterSample(g.current_lang)
    forms['paste'] = WordCounterPaste(
        'I am Sam\nSam I am\nThat Sam-I-am!\nThat Sam-I-am!\nI do not like that Sam-I-am!\nDo you like \ngreen eggs and ham?\nI do not like them, Sam-I-am.\nI do not like\ngreen eggs and ham.\nWould you like them \nhere or there?\nI would not like them\nhere or there.\nI would not like them anywhere.'
    )
    forms['upload'] = WordCounterUpload()
    forms['link'] = WordCounterLink()
    if request.method == 'POST':
        ignore_case = True
        ignore_stopwords = True
        btn_value = request.form['btn']
        sample_id = ''
        extras_to_save = {}
        if btn_value == 'paste':
            words = forms['paste'].data['area']
            ignore_case = forms[btn_value].data['ignore_case_paste']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_paste']
            logger.debug("New from paste: %d chars", len(words))
            title = _('your text')
        elif btn_value == 'upload':
            upload_file = forms['upload'].data['upload']
            words = process_upload(upload_file)
            ignore_case = forms[btn_value].data['ignore_case_upload']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_upload']
            title = upload_file.filename
            logger.debug("New from upload: %s", title)
        elif btn_value == 'sample':
            sample_source = forms['sample'].data['sample']
            samplename = filehandler.get_sample_title(sample_source)
            title = samplename
            ignore_case = forms[btn_value].data['ignore_case_sample']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_sample']
            # samples are cached in mongo keyed by title + option flags, so a
            # repeat request can be served by redirecting to the stored doc
            sample_id = title + str(ignore_case) + str(ignore_stopwords)
            existing_doc_id = mongo.results_for_sample('wordcounter', sample_id)
            if existing_doc_id is not None:
                logger.debug("Existing from sample: %s", sample_source)
                return redirect(request.url + 'results/' + existing_doc_id)
            logger.info("New from sample: %s", sample_source)
            sample_path = filehandler.get_sample_path(sample_source)
            logger.debug(" loading from %s", sample_path)
            words = filehandler.convert_to_txt(sample_path)
            extras_to_save = filehandler.get_sample(sample_source)
        elif btn_value == 'link':
            url = forms['link'].data['link']
            # TODO: should actually accept https
            if 'https://' in url:
                url = url.replace('https', 'http')
            elif 'http://' not in url:
                url = 'http://' + url
            logger.debug("New from link: %s", url)
            content = filehandler.download_webpage(url)
            words = content['text']
            ignore_case = forms[btn_value].data['ignore_case_link']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_link']
            title = _(content['title'])
        if words is not None:
            logger.debug(" about to process words")
            counts = _process_words(words, ignore_case, ignore_stopwords, btn_value == 'sample')
            logger.debug(" finished counts, about to save")
            doc_id = mongo.save_words('wordcounter', counts, ignore_case, ignore_stopwords, str(title), sample_id, btn_value, extras_to_save)
            logger.debug(" saved")
            return redirect(request.url + 'results/' + doc_id + '?submit=true')
    return render_template('wordcounter.html', forms=list(forms.items()), tool_name='wordcounter', max_file_size_in_mb=g.max_file_size_mb)
def test_convert_to_txt_latin1(self):
    """Conversion of the Latin-1 fixture yields the expected text length."""
    fixture = os.path.join(self._fixtures_dir, 'latin-1.txt')
    converted = filehandler.convert_to_txt(fixture)
    self.assertEqual(860, len(converted))
def test_convert_to_txt_utf8(self):
    """Conversion of the UTF-8 fixture yields the expected text length."""
    fixture = os.path.join(self._fixtures_dir, 'utf-8.txt')
    converted = filehandler.convert_to_txt(fixture)
    self.assertEqual(7159, len(converted))