Example #1
0
def process_upload(doc):
    """Extract the text of an uploaded document and remove its file.

    ``filehandler.open_doc`` provides a file path for the upload; the size of
    that file is logged, its text is extracted with
    ``filehandler.convert_to_txt`` and the file is then removed with
    ``filehandler.delete_file``.

    Args:
        doc: the uploaded document, passed straight to
            ``filehandler.open_doc``.

    Returns:
        Whatever ``filehandler.convert_to_txt`` returns for the file.
    """
    file_path = filehandler.open_doc(doc)
    try:
        # because browser might not have sent content_length
        file_size = os.stat(file_path).st_size
        logger.debug("Upload: %d bytes", file_size)
        return filehandler.convert_to_txt(file_path)
    finally:
        # Remove the file even when the size lookup or the conversion raises,
        # so failed uploads are not left behind on disk.
        filehandler.delete_file(file_path)
Example #2
0
def process_upload(doc):
    """Extract the text of an uploaded document and remove its file.

    ``filehandler.open_doc`` provides a file path for the upload; the size of
    that file is logged, its text is extracted with
    ``filehandler.convert_to_txt`` and the file is then removed with
    ``filehandler.delete_file``.

    Args:
        doc: the uploaded document, passed straight to
            ``filehandler.open_doc``.

    Returns:
        Whatever ``filehandler.convert_to_txt`` returns for the file.
    """
    file_path = filehandler.open_doc(doc)
    try:
        # because browser might not have sent content_length
        file_size = os.stat(file_path).st_size
        logger.debug("Upload: %d bytes", file_size)
        words = filehandler.convert_to_txt(file_path)
    finally:
        # Cleanup must also run on failure; otherwise an exception above
        # would leave the uploaded file on disk.
        filehandler.delete_file(file_path)
    return words
Example #3
0
 def test_too_many_counts(self):
     # For the 22kAmazonGameReview fixture, entries 0, 1 and 2 of the
     # word-count result must each hold exactly wordhandler.MAX_ITEMS items.
     review_file = os.path.join(self._fixtures_dir, '22kAmazonGameReview.txt')
     review_text = filehandler.convert_to_txt(review_file)
     result = wordhandler.get_word_counts(review_text, True, True, 'english')
     for position in (0, 1, 2):
         self.assertEqual(len(result[position]), wordhandler.MAX_ITEMS)
 def test_too_many_counts(self):
     # Each of the first three entries returned by get_word_counts for the
     # 22kAmazonGameReview fixture is expected to have MAX_ITEMS elements.
     source_file = os.path.join(self._fixtures_dir, '22kAmazonGameReview.txt')
     all_counts = wordhandler.get_word_counts(
         filehandler.convert_to_txt(source_file), True, True, 'english')
     for idx in range(3):
         self.assertEqual(len(all_counts[idx]), wordhandler.MAX_ITEMS)
Example #5
0
def process_results(file_paths, titles, sample_id, source):
    """Compare the given documents, save the comparison and redirect to it.

    The file sizes are logged, every file is converted to text, the texts are
    run through ``textanalysis.common_and_unique_word_freqs`` and the
    resulting values are stored with ``mongo.save_samediff``.

    Args:
        file_paths: paths of the documents to compare.
        titles: passed through unchanged to ``mongo.save_samediff``.
        sample_id: passed through unchanged to ``mongo.save_samediff``.
        source: passed through unchanged to ``mongo.save_samediff``.

    Returns:
        A redirect to ``results/<job_id>?submit=true`` below the request URL.
    """
    names = filehandler.get_file_names(file_paths)

    # because browser might not have sent content_length
    sizes = []
    for path in file_paths:
        sizes.append(str(os.stat(path).st_size))
    logger.debug("Upload: %s bytes", ", ".join(sizes))

    texts = []
    for path in file_paths:
        texts.append(filehandler.convert_to_txt(path))
    freqs = textanalysis.common_and_unique_word_freqs(texts)

    # Look the values up in the same order the save call consumes them.
    doc1_total = freqs['doc1total']
    doc2_total = freqs['doc2total']
    doc1_unique = freqs['doc1unique']
    doc2_unique = freqs['doc2unique']
    common = freqs['common']
    common_counts = freqs['common_counts']
    doc1 = freqs['doc1']
    doc2 = freqs['doc2']
    similarity = freqs['cosine_similarity']

    job_id = mongo.save_samediff(
        'samediff', names,
        doc1_total, doc2_total,
        doc1_unique, doc2_unique,
        common, common_counts,
        doc1, doc2, similarity,
        titles, sample_id, source)

    target = request.url + 'results/' + job_id + '?submit=true'
    return redirect(target)
Example #6
0
def process_results(file_paths, titles, sample_id, source):
    """Run the same/diff comparison for ``file_paths`` and redirect to it.

    Logs the on-disk size of each file, converts each file to text, computes
    the common/unique word frequencies and saves them with
    ``mongo.save_samediff`` together with ``titles``, ``sample_id`` and
    ``source``.

    Returns:
        A redirect to ``results/<job_id>?submit=true`` appended to the
        current request URL.
    """
    file_names = filehandler.get_file_names(file_paths)

    # because browser might not have sent content_length
    byte_counts = []
    for one_path in file_paths:
        stat_result = os.stat(one_path)
        byte_counts.append(str(stat_result.st_size))
    size_summary = ", ".join(byte_counts)
    logger.debug("Upload: %s bytes", size_summary)

    documents = []
    for one_path in file_paths:
        documents.append(filehandler.convert_to_txt(one_path))
    comparison = textanalysis.common_and_unique_word_freqs(documents)

    job_id = mongo.save_samediff(
        'samediff',
        file_names,
        comparison['doc1total'],
        comparison['doc2total'],
        comparison['doc1unique'],
        comparison['doc2unique'],
        comparison['common'],
        comparison['common_counts'],
        comparison['doc1'],
        comparison['doc2'],
        comparison['cosine_similarity'],
        titles,
        sample_id,
        source,
    )
    results_url = request.url + 'results/' + job_id + '?submit=true'
    return redirect(results_url)
Example #7
0
 def test_convert_to_txt_latin1(self):
     # Converting the latin-1.txt fixture must give a result of length 860.
     latin1_file = os.path.join(self._fixtures_dir, 'latin-1.txt')
     converted = filehandler.convert_to_txt(latin1_file)
     expected_length = 860
     self.assertEqual(len(converted), expected_length)
Example #8
0
 def test_convert_to_txt_utf8(self):
     # Converting the utf-8.txt fixture must give a result of length 7159.
     utf8_file = os.path.join(self._fixtures_dir, 'utf-8.txt')
     converted = filehandler.convert_to_txt(utf8_file)
     expected_length = 7159
     self.assertEqual(len(converted), expected_length)
Example #9
0
def index():
    """Render the word counter page and handle its form submissions.

    For a non-POST request (or a POST that yields no text) the
    ``wordcounter.html`` template is rendered with the sample, paste, upload
    and link forms.  For a POST, the ``btn`` form field selects which form
    was submitted; the text is gathered from that source, counted with
    ``process_words``, saved through ``mongo.save_words``, and the client is
    redirected to the results URL for the saved document.

    Returns:
        The rendered template, or a redirect to a results page.
    """
    words = None

    forms = OrderedDict()
    forms['sample'] = WordCounterSample(g.current_lang)
    forms['paste'] = WordCounterPaste('I am Sam\nSam I am\nThat Sam-I-am!\nThat Sam-I-am!\nI do not like that Sam-I-am!\nDo you like \ngreen eggs and ham?\nI do not like them, Sam-I-am.\nI do not like\ngreen eggs and ham.\nWould you like them \nhere or there?\nI would not like them\nhere or there.\nI would not like them anywhere.')
    forms['upload'] = WordCounterUpload()
    forms['link'] = WordCounterLink()

    if request.method == 'POST':
        # Defaults; each branch below overrides them from its own form.
        ignore_case = True
        ignore_stopwords = True

        btn_value = request.form['btn']
        sample_id = ''

        if btn_value == 'paste':
            words = forms['paste'].data['area']
            ignore_case = forms[btn_value].data['ignore_case_paste']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_paste']
            logger.debug("New from paste: %d chars", len(words))
            title = _('your text')
        elif btn_value == 'upload':
            upload_file = forms['upload'].data['upload']
            words = process_upload(upload_file)
            ignore_case = forms[btn_value].data['ignore_case_upload']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_upload']
            title = upload_file.filename
            logger.debug("New from upload: %s", title)
        elif btn_value == 'sample':
            sample_source = forms['sample'].data['sample']
            samplename = filehandler.get_sample_title(sample_source)
            title = samplename
            ignore_case = forms[btn_value].data['ignore_case_sample']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_sample']
            # The sample id combines the title with both options, so each
            # option combination is looked up (and saved) separately.
            sample_id = title + str(ignore_case) + str(ignore_stopwords)
            existing_doc_id = mongo.results_for_sample('wordcounter', sample_id)
            if existing_doc_id is not None:
                # Results for this sample already exist; reuse them.
                logger.debug("Existing from sample: %s", sample_source)
                return redirect(request.url + 'results/' + existing_doc_id)
            logger.info("New from sample: %s", sample_source)
            sample_path = filehandler.get_sample_path(sample_source)
            logger.debug("  loading from %s", sample_path)
            words = filehandler.convert_to_txt(sample_path)
        elif btn_value == 'link':
            url = forms['link'].data['link']
            # TODO: should actually accept https
            # Only the scheme at the very start of the URL is downgraded or
            # added; an "http(s)" that appears later (path, query string) is
            # left untouched and does not suppress adding the prefix.
            if url.startswith('https://'):
                url = 'http://' + url[len('https://'):]
            elif not url.startswith('http://'):
                url = 'http://' + url
            logger.debug("New from link: %s", url)
            content = filehandler.download_webpage(url)
            words = content['text']
            ignore_case = forms[btn_value].data['ignore_case_link']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_link']
            title = _(content['title'])

        if words is not None:
            logger.debug("  about to process words")
            counts = process_words(words, ignore_case, ignore_stopwords, btn_value == 'sample')
            logger.debug("  finished counts, about to save")
            doc_id = mongo.save_words('wordcounter', counts, ignore_case, ignore_stopwords, title, sample_id, btn_value)
            logger.debug("  saved")
            return redirect(request.url + 'results/' + doc_id + '?submit=true')

    return render_template('wordcounter.html', forms=forms.items(), tool_name='wordcounter', max_file_size_in_mb=g.max_file_size_mb)
Example #10
0
def index():
    """Render the word counter page and handle its form submissions.

    For a non-POST request (or a POST that yields no text) the
    ``wordcounter.html`` template is rendered with the sample, paste, upload
    and link forms.  For a POST, the ``btn`` form field selects which form
    was submitted; the text is gathered from that source, counted with
    ``_process_words``, saved through ``mongo.save_words``, and the client is
    redirected to the results URL for the saved document.

    Returns:
        The rendered template, or a redirect to a results page.
    """
    words = None

    forms = OrderedDict()
    forms['sample'] = WordCounterSample(g.current_lang)
    forms['paste'] = WordCounterPaste(
        'I am Sam\nSam I am\nThat Sam-I-am!\nThat Sam-I-am!\nI do not like that Sam-I-am!\nDo you like \ngreen eggs and ham?\nI do not like them, Sam-I-am.\nI do not like\ngreen eggs and ham.\nWould you like them \nhere or there?\nI would not like them\nhere or there.\nI would not like them anywhere.'
    )
    forms['upload'] = WordCounterUpload()
    forms['link'] = WordCounterLink()

    if request.method == 'POST':
        # Defaults; each branch below overrides them from its own form.
        ignore_case = True
        ignore_stopwords = True

        btn_value = request.form['btn']
        sample_id = ''
        # Only the sample branch fills this in; other sources save no extras.
        extras_to_save = {}

        if btn_value == 'paste':
            words = forms['paste'].data['area']
            ignore_case = forms[btn_value].data['ignore_case_paste']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_paste']
            logger.debug("New from paste: %d chars", len(words))
            title = _('your text')
        elif btn_value == 'upload':
            upload_file = forms['upload'].data['upload']
            words = process_upload(upload_file)
            ignore_case = forms[btn_value].data['ignore_case_upload']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_upload']
            title = upload_file.filename
            logger.debug("New from upload: %s", title)
        elif btn_value == 'sample':
            sample_source = forms['sample'].data['sample']
            samplename = filehandler.get_sample_title(sample_source)
            title = samplename
            ignore_case = forms[btn_value].data['ignore_case_sample']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_sample']
            # The sample id combines the title with both options, so each
            # option combination is looked up (and saved) separately.
            sample_id = title + str(ignore_case) + str(ignore_stopwords)
            existing_doc_id = mongo.results_for_sample('wordcounter',
                                                       sample_id)
            if existing_doc_id is not None:
                # Results for this sample already exist; reuse them.
                logger.debug("Existing from sample: %s", sample_source)
                return redirect(request.url + 'results/' + existing_doc_id)
            logger.info("New from sample: %s", sample_source)
            sample_path = filehandler.get_sample_path(sample_source)
            logger.debug("  loading from %s", sample_path)
            words = filehandler.convert_to_txt(sample_path)
            extras_to_save = filehandler.get_sample(sample_source)
        elif btn_value == 'link':
            url = forms['link'].data['link']
            # TODO: should actually accept https
            # Only the scheme at the very start of the URL is downgraded or
            # added; an "http(s)" that appears later (path, query string) is
            # left untouched and does not suppress adding the prefix.
            if url.startswith('https://'):
                url = 'http://' + url[len('https://'):]
            elif not url.startswith('http://'):
                url = 'http://' + url
            logger.debug("New from link: %s", url)
            content = filehandler.download_webpage(url)
            words = content['text']
            ignore_case = forms[btn_value].data['ignore_case_link']
            ignore_stopwords = forms[btn_value].data['ignore_stopwords_link']
            title = _(content['title'])

        if words is not None:
            logger.debug("  about to process words")
            counts = _process_words(words, ignore_case, ignore_stopwords,
                                    btn_value == 'sample')
            logger.debug("  finished counts, about to save")
            doc_id = mongo.save_words('wordcounter',
                                      counts, ignore_case, ignore_stopwords,
                                      str(title), sample_id, btn_value,
                                      extras_to_save)
            logger.debug("  saved")
            return redirect(request.url + 'results/' + doc_id + '?submit=true')

    return render_template('wordcounter.html',
                           forms=list(forms.items()),
                           tool_name='wordcounter',
                           max_file_size_in_mb=g.max_file_size_mb)
Example #11
0
 def test_convert_to_txt_latin1(self):
     # The latin-1.txt fixture is expected to convert to a result whose
     # length is 860.
     converted = filehandler.convert_to_txt(
         os.path.join(self._fixtures_dir, 'latin-1.txt'))
     actual_length = len(converted)
     self.assertEqual(actual_length, 860)
Example #12
0
 def test_convert_to_txt_utf8(self):
     # The utf-8.txt fixture is expected to convert to a result whose
     # length is 7159.
     converted = filehandler.convert_to_txt(
         os.path.join(self._fixtures_dir, 'utf-8.txt'))
     actual_length = len(converted)
     self.assertEqual(actual_length, 7159)