Exemple #1
0
 def test_handler(self):
     """Ensure a handler registered for a type runs when that type is converted.

     The handler is exercised twice: once through a file path and once
     through an already-open file object.
     """
     def test_handler(f, type):
         # Stub handler: ignores its arguments and returns the sentinel text.
         return TEST

     fulltext.add('application/test', '.test', test_handler)
     self.assertEqual(fulltext.get('files/test.test'), TEST)
     # `open` inside a `with` block replaces the Python-2-only `file()`
     # builtin and closes the handle even when the assertion fails.
     with open('files/test.test', 'r') as fo:
         self.assertEqual(fulltext.get(fo), TEST)
 def test_unknown_ext(self):
     # An unrecognised extension is expected to end up on the bin backend.
     path = self.touch('testfn.unknown')
     with mock.patch('fulltext.handle_path', return_value="") as patched:
         fulltext.get(path)
     # Inspect the first positional argument given to the patched function.
     backend = patched.call_args[0][0]
     self.assertEqual(backend.__module__, 'fulltext.backends.__bin')
Exemple #3
0
    def handle_fobj(self, path_or_file):
        """Extract text from `path_or_file`, delegating to another backend.

        The backend is looked up from the original file name when that name
        has an extension other than ``.gz``; otherwise it is looked up from
        the file object.  If extraction from the file object raises, the
        content is passed through `fobj_to_tempfile` and extraction is
        retried from the resulting file name.
        """
        # Imported here to avoid circular imports.
        from fulltext import get, backend_from_fname, backend_from_fobj

        fobj, path = self.get_fobj_and_path(path_or_file)
        with fobj:
            orig_name = orig_fname(path)
            name_is_usable = (
                _has_ext(orig_name) and
                splitext(orig_name)[1].lower() != '.gz')
            if name_is_usable:
                backend = backend_from_fname(orig_name)
            else:
                backend = backend_from_fobj(fobj)

            try:
                return get(fobj, backend=backend)
            except Exception:
                # Some backends cannot work with gzip.GzipFile instances, so
                # fall back to copying the file to disk.
                # See: https://github.com/btimby/fulltext/issues/56
                LOGGER.info(
                    "%r backend could not handle gzip file object directly; "
                    "retrying by extracting the gzip on disk" % backend)

                retry_fobj, _ = self.get_fobj_and_path(path_or_file)
                suffix = splitext(orig_name)[1]
                with retry_fobj:
                    with fobj_to_tempfile(retry_fobj, suffix=suffix) as tmp_name:
                        return get(tmp_name, backend=backend)
Exemple #4
0
 def test_command(self):
     """Commands registered for a type must be executed by the `run_command`
     handler when that type is converted."""
     echo_twice = (('echo', TEST), ('echo', TEST), )
     fulltext.add('application/test', '.test', fulltext.run_command, echo_twice)

     # Conversion through a path ...
     from_path = fulltext.get('files/test.test')
     self.assertEqual(from_path, TEST)
     # ... and through an open file object.
     with open('files/test.test', 'r') as fo:
         from_fobj = fulltext.get(fo)
         self.assertEqual(from_fobj, TEST)
 def test_src_code_ext(self):
     # A ".js" source file is expected to be handled by the text backend.
     js_name = "file.js"
     self.touch(js_name, content="foo bar")
     with mock.patch('fulltext.handle_path', return_value="") as patched:
         fulltext.get(js_name)
     backend = patched.call_args[0][0]
     self.assertEqual(backend.__module__, 'fulltext.backends.__text')
 def test_text_ext(self):
     # Every one of these extensions is expected to map to the text backend.
     extensions = (".py", ".cpp", ".h", ".pl")
     for suffix in extensions:
         path = self.touch("document%s" % suffix)
         with mock.patch('fulltext.handle_path', return_value="") as patched:
             fulltext.get(path)
         backend = patched.call_args[0][0]
         self.assertEqual(backend.__module__, 'fulltext.backends.__text')
Exemple #7
0
    def handle_fobj(self, path_or_file):
        """Extract text from `path_or_file` through a delegated backend.

        A backend is derived from the original file name if it carries an
        extension other than ``.gz``, and from the file object otherwise.
        Should extraction from the file object raise, the data is routed
        through `fobj_to_tempfile` and extraction is attempted again using
        the file name it yields.
        """
        # Local import: avoids circular imports.
        from fulltext import get, backend_from_fname, backend_from_fobj

        source, path = self.get_fobj_and_path(path_or_file)
        with source:
            orig_name = orig_fname(path)
            if not _has_ext(orig_name) or \
                    splitext(orig_name)[1].lower() == '.gz':
                backend = backend_from_fobj(source)
            else:
                backend = backend_from_fname(orig_name)

            try:
                return get(source, backend=backend)
            except Exception:
                # Not every backend copes with gzip.GzipFile instances, so
                # the file is copied to disk for a second attempt.
                # See: https://github.com/btimby/fulltext/issues/56
                LOGGER.info(
                    "%r backend could not handle gzip file object directly; "
                    "retrying by extracting the gzip on disk" % backend)

                second_source, _ = self.get_fobj_and_path(path_or_file)
                suffix = splitext(orig_name)[1]
                with second_source:
                    with fobj_to_tempfile(second_source, suffix=suffix) as tmp_name:
                        return get(tmp_name, backend=backend)
 def test_by_name_with_no_ext(self):
     # A `name` without any extension is expected to select the bin backend.
     path = self.touch("woodstock-no-ext")
     with mock.patch('fulltext.handle_path', return_value="") as patched:
         fulltext.get(path, name=path)
     backend = patched.call_args[0][0]
     self.assertEqual(backend.__module__, 'fulltext.backends.__bin')
 def test_by_backend(self):
     # An explicit `backend` option must win over the file extension.
     path = self.touch('testfn.doc')
     with mock.patch('fulltext.handle_path', return_value="") as patched:
         fulltext.get(path, backend='html')
     backend = patched.call_args[0][0]
     self.assertEqual(backend.__module__, 'fulltext.backends.__html')
Exemple #10
0
 def test_params(self):
     # The object handed to handle_path must carry the encoding options
     # that were given to fulltext.get().
     path = self.touch('testfn.doc')
     with mock.patch('fulltext.handle_path', return_value="") as patched:
         fulltext.get(path, encoding='foo', encoding_errors='bar')
     backend = patched.call_args[0][0]
     self.assertEqual(backend.encoding, 'foo')
     self.assertEqual(backend.encoding_errors, 'bar')
Exemple #11
0
 def test_name_attr(self):
     # Make sure that fulltext attempts to determine the file name
     # from the "name" attribute of the file object.
     # The temporary file is used as a context manager so it is closed
     # (and therefore removed) even when an assertion fails.
     with tempfile.NamedTemporaryFile(suffix='.html') as f:
         with mock.patch('fulltext.handle_fobj', return_value="") as m:
             fulltext.get(f)
             klass = m.call_args[0][0]
             self.assertEqual(klass.__module__, 'fulltext.backends.__html')
Exemple #12
0
 def test_html(self):
     # A file without extension that holds the HTML fixture is expected to
     # be dispatched to the html backend.
     fname = "file-noext"
     # Read the fixture inside `with` so the handle is closed right away
     # instead of being left for the garbage collector.
     with open(pathjoin(HERE, 'files/test.html'), 'rb') as fixture:
         html = fixture.read()
     self.touch(fname, content=html)
     with mock.patch('fulltext.handle_path', return_value="") as m:
         fulltext.get(fname)
         klass = m.call_args[0][0]
         self.assertEqual(klass.__module__, 'fulltext.backends.__html')
Exemple #13
0
    def test_handler(self):
        """Ensure a handler registered for a type runs when that type is converted.

        The handler is exercised twice: once through a file path and once
        through an already-open file object.
        """

        def test_handler(f, type):
            # Stub handler: ignores its arguments and returns the sentinel.
            return TEST

        fulltext.add('application/test', '.test', test_handler)
        self.assertEqual(fulltext.get('files/test.test'), TEST)
        # `open` inside a `with` block replaces the Python-2-only `file()`
        # builtin and closes the handle even when the assertion fails.
        with open('files/test.test', 'r') as fo:
            self.assertEqual(fulltext.get(fo), TEST)
Exemple #14
0
 def test_command(self):
     """Ensures that commands registered for a given type are executed by the `run_command` handler
     when that type is converted."""
     fulltext.add('application/test', '.test', fulltext.run_command, (
         ('echo', TEST),
         ('echo', TEST),
     ))
     self.assertEqual(fulltext.get('files/test.test'), TEST)
     # `open` inside a `with` block replaces the Python-2-only `file()`
     # builtin and closes the handle even when the assertion fails.
     with open('files/test.test', 'r') as fo:
         self.assertEqual(fulltext.get(fo), TEST)
Exemple #15
0
    def test_register_backend_ext(self):
        # Register the html backend module for a made-up extension, then
        # check that a file with that extension is dispatched to it.
        made_up_ext = '.ijustmadethisup'
        fulltext.register_backend(
            'application/ijustmadethisup',
            'fulltext.backends.__html',
            extensions=[made_up_ext])

        path = self.touch("document.ijustmadethisup")
        with mock.patch('fulltext.handle_path', return_value="") as patched:
            fulltext.get(path)
        backend = patched.call_args[0][0]
        self.assertEqual(backend.__module__, 'fulltext.backends.__html')
Exemple #16
0
def brute_txt(fn):
    """
    Convert the file at path `fn` to plain text.

    The converter is chosen from the file extension (compared
    case-sensitively): epub, xml/html/htm, txt and pdf have dedicated
    converters; any other extension is handed to `fulltext.get`.

    Returns '' when the path does not exist or when the chosen converter
    produced no text.  The content of a .txt file is returned straight from
    `to_unicode`, without the clean-up applied to the other formats.
    """
    if not os.path.exists(fn):
        print('! No filename found')
        return ''

    # Extension without the leading dot.
    ext = os.path.splitext(fn)[-1][1:]

    if ext in {'epub'}:
        txt = epub2txt(fn)
    elif ext in {'xml', 'html', 'htm'}:
        with open(fn) as f:
            content = f.read()
        txt = xml2txt(content, CONTENT_TAGS[ext])
    elif ext in {'txt'}:
        with open(fn, 'rb') as f:
            content = f.read()
        return to_unicode(content)
    elif ext in {'pdf'}:
        txt = pdf2txt(fn)
    else:
        # Imported lazily: only needed for the fallback path.
        import fulltext
        txt = fulltext.get(fn)

    # Any converter may come back empty-handed (None or ''); stop here
    # rather than failing on None in the string clean-up below.
    if not txt:
        return ''

    # clean
    txt = txt.replace('\xa0', ' ')
    if 'project gutenberg ebook' in txt.lower():
        txt = clean_gutenberg(txt)
    return txt
Exemple #17
0
 def extract_document_text(self, filename, encoding="iso-8859-13", language="est"):
     """Extract the text of `filename` and decode it using `encoding`.

     Files whose extension is not one of .doc/.docx/.rtf/.pdf/.odt are
     passed to fulltext with an explicit ("application/msword", None) type.
     A PDF that yields no text is run through the external `pypdfocr`
     command (with `language`); when that leaves a "<name>_ocr<ext>" file
     behind, it replaces the original file and extraction is repeated.
     """
     known_extensions = {".doc", ".docx", ".rtf", ".pdf", ".odt"}
     name, extension = os.path.splitext(filename)
     if extension in known_extensions:
         forced_type = None
     else:
         forced_type = ("application/msword", None)

     def read_text():
         # Single place for the extract-and-decode step used twice below.
         return unicode(fulltext.get(filename, type=forced_type), encoding=encoding)

     text = read_text()
     if extension != ".pdf" or len(text) != 0:
         return text

     # Empty PDF text: try OCR and re-read the file if OCR produced output.
     ocr = subprocess.Popen(("pypdfocr", "-l", language, filename), close_fds=True)
     ocr.communicate()
     ocr_filename = "{}_ocr{}".format(name, extension)
     if not os.path.isfile(ocr_filename):
         print ("failed to ocr: {}".format(filename))
         return text
     os.rename(ocr_filename, filename)
     return read_text()
Exemple #18
0
def load_test(fi) -> tuple:
    """Extract a test from `fi` and split off its reading passage.

    The text is obtained with `fulltext.get`; the multiple-choice
    instruction sentence is removed and everything from "answer sheet"
    (case-insensitive) onwards is dropped.  The reading passage is the span
    from "read the text below" (case-insensitive) up to the marker "(21)".

    Returns:
        A ``(test, reading)`` tuple: the test text with the reading passage
        removed, and the passage itself.  When either marker is missing, or
        the markers are out of order, ``reading`` is "" and ``test`` is
        returned with nothing removed.
    """
    test = fulltext.get(fi).replace(
        'Choose the best word or phrase (a, b, c or d) to fill each blank.', '')

    answers_at = test.lower().find("answer sheet")
    if answers_at != -1:
        test = test[:answers_at]

    reading_start = test.lower().find("read the text below")
    reading_end = test.find("(21)")
    # str.find() returns -1 on a miss; as a slice bound that would mean
    # "last character" and silently produce a bogus passage.
    if reading_start == -1 or reading_end == -1 or reading_end <= reading_start:
        return test, ""

    reading = test[reading_start:reading_end]
    return test.replace(reading, ""), reading
Exemple #19
0
    def post(self, request, *args, **kwargs):
        """Apply an uploaded answer key to the module-level `questions`.

        Responds with HTTP 400 when `questions` is an empty list or when the
        upload does not validate.  Otherwise every line of the extracted
        text that starts with "(<number>) <a-d>" sets `answ_correct` on the
        question at position number - 1, all stored TestQuestion rows are
        replaced by the current questions, and the client is redirected to
        the test editor.
        """
        global questions
        if questions == []:
            return Response({"status": "Keys can not be pasted before questions."}, status=status.HTTP_400_BAD_REQUEST)

        file_serializer = FileSerializer(data=request.data)
        if not file_serializer.is_valid():
            return Response(file_serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        uploaded = file_serializer.save()
        uploaded.file.open(mode='rb')
        with uploaded.file:
            answer_text = fulltext.get(uploaded.file)
            for line in answer_text.split('\n'):
                if not re.match(r'\(\d+\) [a-d]', line):
                    continue
                number = int(re.findall(r'\d+', line)[0])
                letter = re.findall(r'[a-d]', line)[0]
                questions[number - 1].answ_correct = letter

        # Rebuild the stored questions from scratch.
        TestQuestion.objects.all().delete()
        for question in questions:
            options = question.answers
            TestQuestion.objects.create(
                number=question.number,
                text=question.text,
                answ_correct=question.answ_corr(),
                answ_option1=options[0],
                answ_option2=options[1],
                answ_option3=options[2],
                answ_option4=options[3],
                is_reading=question.is_reading,
            )

        return HttpResponseRedirect(base_path.BASE_PATH + 'test_editor/')
Exemple #20
0
 def test_global_vars(self):
     # The module-level ENCODING / ENCODING_ERRORS settings must be
     # honoured and passed on to the underlying backend.
     saved = (fulltext.ENCODING, fulltext.ENCODING_ERRORS)
     path = self.touch("file.txt", content=b"hello")
     try:
         fulltext.ENCODING = "foo"
         fulltext.ENCODING_ERRORS = "bar"
         with mock.patch('fulltext.handle_path', return_value="") as patched:
             fulltext.get(path)
         backend = patched.call_args[0][0]
         self.assertEqual(backend.encoding, 'foo')
         self.assertEqual(backend.encoding_errors, 'bar')
     finally:
         # Put the original globals back whatever happened above.
         fulltext.ENCODING, fulltext.ENCODING_ERRORS = saved
Exemple #21
0
 def _handle_text(self, f):
     """Shared body of the two 'text mode' tests.

     Extracts text from `f` using this test case's mime type, compares it
     with the expected text, and closes `f` whatever the outcome.
     """
     try:
         extracted = fulltext.get(f, mime=self.mime)
         self.assertMultiLineEqual(self.text, extracted)
     finally:
         f.close()
Exemple #22
0
def upload_file():
    """Store an uploaded file, then rebuild the text dumps and database.json.

    On a POST with an allowed file, the upload is saved into the configured
    upload folder.  Every entry returned by `database_files()` is then
    converted to text, written to "txt/<name>txt", and described by one
    record (name, file path, cosine data, shingles, moodles) in
    "database.json".  Entries that cannot be converted are skipped.
    Finally the client is redirected to the `uploaded_file` endpoint.
    """
    if request.method == 'POST':
        upload = request.files['file']
        if upload and allowed_file(upload.filename):
            filename = secure_filename(upload.filename)
            upload.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            str_data = []
            for k in database_files():
                try:
                    text_from_file = fulltext.get(
                        'uploads/' + str(k),
                        None).replace('\n', ' ').replace('\"',
                                                         "").replace("\'", "")
                except Exception:
                    # Best effort: a file that cannot be converted (including
                    # fulltext returning the None default) is left out.
                    continue
                cosine_freq = cosine.cosinedatabaseTF(text_from_file)

                shingles = [
                    "{" + shinglmethods.genshingle_n(text_from_file, i) + "}"
                    for i in range(1, 4)]
                shingle_t = "\"shingles_by_id\": [" + ",".join(shingles) + "]"

                # NOTE(review): this uses the same generator as `shingles`
                # above; presumably a sorted variant was intended - confirm.
                shingles_sorted = [
                    "{" + shinglmethods.genshingle_n(text_from_file, i) + "}"
                    for i in range(1, 4)]
                # Join the sorted list (the original joined `shingles` here).
                shingle_t_sorted = "\"shingles_sorted_by_id\": [" + ",".join(
                    shingles_sorted) + "]"

                moodles = [
                    "{" + moodlemethod.genmoodle_n(text_from_file, i) + "}"
                    for i in range(1, 4)]
                moodle_t = "\"moodles_by_id\": [" + ",".join(moodles) + "]"

                # "docx" has to be stripped before "doc"; the other way
                # round leaves a stray "x" ("a.docx" -> "a.xtxt").
                filepath = "txt/" + str(k).replace("docx", "").replace(
                    "doc", "").replace("txt", "") + "txt"
                with open(filepath, "w") as hs:
                    hs.write(text_from_file)

                record = "\"name\":\"{}\", \"filepath\":\"{}\", {}, {}, {}, {}".format(
                    str(k), filepath, cosine_freq, shingle_t,
                    shingle_t_sorted, moodle_t)
                str_data.append("{" + record + "}")

            text_for_json = ",".join(str_data)
            with open("database.json", "w") as hj:
                hj.write("[" + text_for_json + "]")

            return redirect(url_for('uploaded_file', filename=filename))

    return '''
Exemple #23
0
 def test_text_strip(self):
     """Check that the bin backend strips its output as expected."""
     raw = (b'  Test leading and trailing spaces removal.  '
            b'Test @$%* punctuation removal! '
            b'Test    spaces     removal! ')
     # A BytesIO built from the bytes starts at position 0, ready to read.
     stream = BytesIO(raw)
     stripped = fulltext.get(stream, backend='bin')
     expected = ('Test leading and trailing spaces removal. '
                 'Test punctuation removal! Test spaces '
                 'removal!')
     self.assertMultiLineEqual(expected, stripped)
Exemple #24
0
def upload_file_for_check():
    """Save an uploaded file under /tmp and render its extracted text.

    Only acts on a POST carrying an allowed file; the text extracted by
    fulltext is passed to the 'index.html' template as `query`.
    """
    if request.method == 'POST':
        upload = request.files['file']
        if upload and allowed_file(upload.filename):
            filename = secure_filename(upload.filename)
            saved_to = os.path.join('/tmp', filename)
            upload.save(saved_to)
            extracted = fulltext.get('/tmp/' + filename)
            return render_template('index.html', query=extracted)

    return '''
Exemple #25
0
    def test_callbacks(self):
        # The backend hooks must all fire, in the order
        # setup -> handle_fobj -> teardown.
        calls = []

        class Backend:
            # Minimal stand-in backend that records which hooks ran.

            def setup(self):
                calls.append("setup")

            def teardown(self):
                calls.append("teardown")

            def handle_fobj(self, path):
                calls.append("handle_fobj")
                return "text"

        path = self.touch('testfn.doc')
        stub = Backend()
        with mock.patch('fulltext.backend_inst_from_mod',
                        return_value=stub):
            fulltext.get(path, encoding='foo', encoding_errors='bar')
        expected_order = ['setup', 'handle_fobj', 'teardown']
        self.assertEqual(calls, expected_order)
Exemple #26
0
    def test_teardown_on_err(self):
        # teardown must still run when handle_fobj raises.
        calls = []

        class Backend:
            # Stand-in backend whose handle_fobj always fails.

            def setup(self):
                calls.append("setup")

            def teardown(self):
                calls.append("teardown")

            def handle_fobj(self, path):
                1 / 0

        path = self.touch('testfn.doc')
        failing = Backend()
        with mock.patch('fulltext.backend_inst_from_mod',
                        return_value=failing):
            with self.assertRaises(ZeroDivisionError):
                fulltext.get(path, encoding='foo', encoding_errors='bar')

        expected_order = ['setup', 'teardown']
        self.assertEqual(calls, expected_order)
Exemple #27
0
    def handle_fobj(self, f):
        """Return the concatenated text of every entry of the RAR archive `f`.

        Each entry is opened from the archive and passed to `fulltext.get`
        together with its file name and this backend's encoding settings.
        All opened entries are released when the method returns.
        """
        from fulltext import get  # avoid circular import
        with ExitStack() as stack:
            collected = StringIO()
            archive = stack.enter_context(rarfile.RarFile(f))
            for member in archive.infolist():
                LOGGER.debug("extracting %s" % member.filename)

                member_fobj = stack.enter_context(archive.open(member))
                member_text = get(
                    member_fobj,
                    name=member.filename,
                    encoding=self.encoding,
                    encoding_errors=self.encoding_errors)
                collected.write(member_text)

            return collected.getvalue()
Exemple #28
0
    def handle_fobj(self, f):
        """Return the concatenated text of every entry of the ZIP archive `f`.

        Entries are processed in sorted name order; each one is opened from
        the archive and handed to `fulltext.get` along with its name.
        """
        from fulltext import get  # avoid circular import
        with ExitStack() as stack:
            collected = StringIO()
            archive = stack.enter_context(zipfile.ZipFile(f, 'r'))
            for member_name in sorted(archive.namelist()):
                LOGGER.debug("extracting %s" % member_name)
                member = stack.enter_context(archive.open(member_name, 'r'))
                # Hack: zipfile's open() does not handle "b" in the mode, so
                # it is appended here to satisfy an assertion in
                # handle_fobj().
                member.mode += 'b'
                collected.write(get(member, name=member_name))

            return collected.getvalue()
Exemple #29
0
    def handle_fobj(self, f):
        """Extract and join the text of all entries in the ZIP archive `f`.

        Entry names are visited in sorted order; every entry is opened from
        the archive and given to `fulltext.get` with its name.
        """
        from fulltext import get  # avoid circular import
        with ExitStack() as stack:
            output = StringIO()
            zipped = stack.enter_context(zipfile.ZipFile(f, 'r'))
            for entry_name in sorted(zipped.namelist()):
                LOGGER.debug("extracting %s" % entry_name)
                entry = stack.enter_context(zipped.open(entry_name, 'r'))
                # Hack: zipfile's open() does not handle "b" in the mode;
                # add it so an assertion in handle_fobj() is satisfied.
                entry.mode += 'b'
                entry_text = get(entry, name=entry_name)
                output.write(entry_text)

            return output.getvalue()
Exemple #30
0
    def handle_fobj(self, f):
        """Extract and join the text of all entries in the RAR archive `f`.

        Every entry is opened from the archive and given to `fulltext.get`
        with its file name and this backend's encoding settings.
        """
        from fulltext import get  # avoid circular import
        with ExitStack() as stack:
            output = StringIO()
            rar = stack.enter_context(rarfile.RarFile(f))
            for entry in rar.infolist():
                LOGGER.debug("extracting %s" % entry.filename)

                entry_fobj = stack.enter_context(rar.open(entry))
                output.write(get(entry_fobj,
                                 name=entry.filename,
                                 encoding=self.encoding,
                                 encoding_errors=self.encoding_errors))

            return output.getvalue()
    def scanDoc(self, path):
        """Return the keywords of the document at `path` as one string.

        The text extracted by fulltext is passed through
        `remove_non_ascii`; when it comes back empty, `get_image_content`
        is used instead.  The text is tokenized, and tokens that are Dutch
        stop words or listed punctuation marks are dropped.  The remaining
        tokens are joined with single spaces.
        """
        text = self.remove_non_ascii(fulltext.get(path))

        # No text at all: maybe the pdf was actually an image.
        if text == "":
            print(
                'Maybe pdf contained only images, trying to get text from image'
            )
            text = self.get_image_content(path)

        tokens = word_tokenize(text)

        # Sets give constant-time membership tests; the filter below runs
        # once per token, so list lookups would rescan every stop word.
        excluded = set(stopwords.words('dutch'))
        excluded.update(
            ('(', ')', ';', ':', '[', ']', ',', '.', "'", '@', '&'))

        return " ".join(word for word in tokens if word not in excluded)
Exemple #32
0
 def test_invalid_char(self):
     # Optional fixture with an invalid character: strict decoding must
     # fail, while encoding_errors="ignore" must give the expected text.
     invalid_path = pathjoin(HERE, "files/unicode/invalid.%s" % self.ext)
     if os.path.exists(invalid_path):
         with self.assertRaises(UnicodeDecodeError):
             fulltext.get(invalid_path)
         lenient = fulltext.get(invalid_path, encoding_errors="ignore")
         self.assertEqual(lenient, self.invalid)

     # Italian fixture decoded as ASCII: strict decoding must fail, and
     # ignoring errors must drop the accented vowels.
     italian_path = pathjoin(HERE, "files/unicode/it.%s" % self.ext)
     with self.assertRaises(UnicodeDecodeError):
         fulltext.get(italian_path, encoding='ascii')
     lenient = fulltext.get(
         italian_path, encoding='ascii', encoding_errors="ignore")
     expected = self.italian.replace(u"àèìòù", u"")
     expected = expected.replace(u"  ", u" ").strip()
     self.assertEqual(lenient, expected)
Exemple #33
0
 def test_doc(self):
     # Converting the sample .doc file must yield the expected text.
     extracted = fulltext.get('files/test.doc')
     self.assertEqual(extracted, TEST)
Exemple #34
0
def _handle_open(path):
    """Open `path` in binary mode and return what fulltext extracts from it."""
    with open(path, 'rb') as fobj:
        extracted = fulltext.get(fobj)
    return extracted
Exemple #35
0
 def test_default_none(self):
     "None must be accepted as the default value."
     result = fulltext.get('unknown-file.foobar', None)
     self.assertEqual(result, None)
Exemple #36
0
 def test_unknown_default(self):
     "An unknown file type must yield the default value, not an exception."
     fallback = 'canary'
     result = fulltext.get('unknown-file.foobar', 'canary')
     self.assertEqual(result, fallback)
Exemple #37
0
def main():
    """Find cadastral units mentioned in register documents and their distance from an address.

    Steps, in order:
      1. Prompt for an address (a built-in default is used on empty input)
         and look it up through the xgis.maaamet.ee address service.
      2. Fetch a list of document ids from the Amphora document register
         and download each document's main file into a freshly created
         "documents/" folder next to this script.
      3. Convert every downloaded file to text with fulltext, pick out
         cadastral numbers with a regular expression, query the coordinates
         of each number and compute its distance from the address.
      4. Print the results per document.

    Written for Python 2 (print statements, raw_input).
    """
    ###############################################################################
    # my address
    ###############################################################################

    proj = Proj(init="epsg:3301")  # L-EST97 projection

    # Feature type names: VAIKEKOHT,TANAV,KATASTRIYKSUS,EHITISHOONE
    querystring = {
        "dogis_link": "getgazetteer",
        "features": "EHITISHOONE",
        "results": "5"
    }

    address = raw_input("Sisesta otsitav aadress (või vajuta ENTER): ")
    if not address:
        # Default address used when nothing is typed in.
        # 25°24'14.586"E 59°29'9.822"N
        # 579555,6595094
        address = "Lõuna tee 15, Mäepea küla"

    # Land Board (Maa-amet) service for obtaining address info (including coordinates)
    # http://geoportaal.maaamet.ee/est/Teenused/X-GIS-JSON-aadressiotsingu-teenuse-kirjeldus-p502.html
    querystring["address"] = address
    response = requests.request('GET', 'http://xgis.maaamet.ee/xGIS/XGis', params=querystring)

    addresses = defaultdict(list)
    try:
        # "featureMember" may be a single dict or a list of dicts.
        json = response.json()["featureMember"]
        if isinstance(json, dict):
            get_address(json, addresses, proj)
        elif isinstance(json, list):
            for member in json:
                get_address(member, addresses, proj)
    except (ValueError, KeyError):
        print "ei leitud", response.status_code

    response.close()

    # Only the first "EHITISHOONE" entry is used from here on.
    tunnus, address, lest97, geo = addresses["EHITISHOONE"][0]
    print "======================================================================="
    print "Leitud aadress:"
    print address
    print "Koordinaadid:"
    pprint(geo)  # geographic coordinates
    pprint(lest97)  # projected coordinates
    print "======================================================================="

    raw_input("(vajuta ENTER):")

    ###############################################################################
    # amphora topics
    # Planning and construction - 5059
    # Initiation of detailed plans - 50285
    # Adoption of detailed plans - 50286
    # Acceptance of detailed plans - 50287
    # Determination of design conditions - 50288
    # Land management - 50344
    ###############################################################################

    # document register of Kuusalu municipality
    url = "http://server.amphora.ee/atp/kuusaluvv/AmphoraPublic.asmx"
    headers = {
        "content-type": "application/x-www-form-urlencoded"
    }

    ###############################################################################

    payload = {
        "type": "DOCUMENT",
        "topicID": "5059",
        "maxRows": "20",
        "unitID": "",
        "folderID": "",
        "formID": "",
        "phrase": "",
        "startRowIndex": "",
        "detailMetadata": ""
    }

    # query the list of documents from the register
    articles = dict()
    response = requests.post(url + "/GetItemList", data=payload, headers=headers, stream=True)
    response.raw.decode_content = True

    # 259449 and 259430 are large documents; they are left out for now for demo purposes
    for event, element in et.iterparse(response.raw):
        if get_element_tag(element.tag) == "sys_id" and element.text != "259449" and element.text != "259430":
            articles[element.text] = dict()
        element.clear()

    response.close()

    ###############################################################################
    # documents
    ###############################################################################

    # Start from an empty "documents/" folder next to this script.
    articles_folder = os.path.dirname(os.path.abspath(__file__)) + "/documents/"
    if os.path.exists(articles_folder):
        shutil.rmtree(articles_folder)
    os.makedirs(articles_folder)

    # query the metadata and file of each individual document
    progress = tqdm(articles)
    for key in progress:
        progress.set_description("Dokumendid %s" % key)
        payload = {
            "id": key,
            "maxDepth": "0"
        }

        # document query
        response = requests.request("POST", url + "/GetItem", data=payload, headers=headers, stream=True)
        response.raw.decode_content = True

        # `path` tracks the tags of the elements currently open in the stream.
        path = deque()
        content = None
        filename = None
        filetype = None
        for event, element in et.iterparse(response.raw, events=("start", "end")):
            element_tag = get_element_tag(element.tag)
            if event == "start":
                path.append(element_tag)
            elif event == "end":
                # Values are only collected while inside a "file" element.
                if "file" in path:
                    if element_tag == "data":
                        content = base64.decodestring(element.text or "")
                    elif element_tag == "filename":
                        filename = element.text
                    elif element_tag == "type":
                        filetype = element.text
                if element_tag == "field" and "name" in element.attrib and element.attrib["name"] == "Caption":
                    articles[key]["title"] = element.text
                path.pop()
                element.clear()

            # Stop parsing as soon as a complete MAIN_FILE has been seen.
            if content is not None and filename is not None and filetype == "MAIN_FILE":
                _, extension = os.path.splitext(filename)
                articles[key]["file"] = key + extension
                out = open(articles_folder + key + extension, "wb")  # saving the file
                out.write(content)
                out.close()
                break
        else:
            # for/else: reached only when the loop ended without `break`,
            # i.e. no main file was saved for this document.
            print "ei sisalda faili", key

        response.close()

    print "======================================================================="

    ###############################################################################
    # files
    ###############################################################################

    # Land Board (Maa-amet) service for querying coordinates by cadastral number
    # http://geoportaal.maaamet.ee/est/Teenused/Poordumine-kaardirakendusse-labi-URLi-p9.html#a13
    url = "http://geoportaal.maaamet.ee/url/xgis-ky.php"
    querystring = {
        "what": "tsentroid",
        "out": "json"
    }

    # distance calculation using projected coordinates
    point = lambda coordinate: float(coordinate)
    distance = lambda src, dest: math.sqrt(
        (point(src[0]) - point(dest[0])) ** 2 + (point(src[1]) - point(dest[1])) ** 2)

    pattern = re.compile("\d{5}:\d{3}:\d{4}")  # regexp for finding cadastral numbers in text
    _, _, x_y, _ = addresses["EHITISHOONE"][0]  # my address
    progress = tqdm(articles)
    for key in progress:
        progress.set_description("Koordinaadid %s" % key)
        articles[key]["katastrinumbrid"] = list()
        if "file" not in articles[key]:
            continue
        text = fulltext.get(articles_folder + articles[key]["file"])  # converting pdf, doc and rtf files to text
        # A set, so each distinct cadastral number is queried only once.
        katastrinumbrid = set(pattern.findall(text))
        for number in katastrinumbrid:
            querystring["ky"] = number
            response = requests.request("GET", url, params=querystring)  # coordinate query to the Land Board
            try:
                json = response.json()["1"]
                distance_km = distance(x_y, (json["X"], json["Y"])) / 1000  # distance of the cadastral number from my address
                longitude, latitude = proj(json["X"], json["Y"],
                                           inverse=True)  # geographic coordinates of the cadastral number (can be pasted straight into Google Maps)
                articles[key]["katastrinumbrid"].append(
                    (number, (json["X"], json["Y"]), distance_km, (latitude, longitude)))
            except (ValueError, KeyError):
                print "koordinaate ei leitud", response.status_code, key, number
            response.close()

    print "======================================================================="

    # Report only the documents in which cadastral numbers were located.
    for key in articles:
        if articles[key]["katastrinumbrid"]:
            print "---------------"
            print "Dokument:", key, articles[key]["title"]
            print "---------------"
            for number in articles[key]["katastrinumbrid"]:
                knumber, lest97, kaugus, (latitude, longitude) = number
                print knumber, " koordinaadid:", latitude, ",", longitude, " kaugus minu aadressist:", kaugus
            print "======================================================================="
Exemple #38
0
def download_file(d):
    """Fetch the file behind a document link and store its extracted text.

    Looks up the ``DocumentLink`` with primary key ``d.pk`` and its
    ``Document`` (created if it does not exist yet).  When ``d.url``
    resolves to a file whose extension is one of doc/pdf/docx/xls:

    * a document that is new, or not yet marked as downloaded, is saved to
      ``<cwd>/docstore/<pk>.<ext>``; on success its URL hash, file hash and
      extracted text are recorded;
    * a document already marked as downloaded is fetched again to
      ``update.<timestamp>.<pk>.<ext>``; that copy is kept and its text
      re-extracted only if the URL hash or the file hash differs from the
      stored values, otherwise the copy is deleted.

    The document is saved in every case.

    :param d: object exposing ``pk`` and ``url``.
    """
    import logging

    logger = logging.getLogger(__name__)

    document_link = DocumentLink.objects.get(pk=d.pk)
    doc, created = Document.objects.get_or_create(document_link=document_link)
    extensions = ('doc', 'pdf', 'docx', 'xls')
    document_content = ''

    if d.url:
        # NOTE(review): '__file__' is a string literal here, so this resolves
        # to the current working directory rather than this module's
        # directory. Kept as-is because existing files live there -- confirm.
        wk_dir = os.path.dirname(os.path.realpath('__file__'))
        save_path = wk_dir + "/docstore/"

        # Follow redirects (e.g. URL shorteners) to find the final URL.
        r = requests.head(d.url, allow_redirects=True)
        long_url = r.url if d.url != r.url else d.url
        doc.long_url = long_url
        local_filename = long_url.split('/')[-1]
        doc.document_name = local_filename

        # Only URLs whose last path segment ends in a known extension are
        # downloaded.
        file_extension = local_filename.split('.')[-1].lower()
        save_name = str(d.pk) + '.' + file_extension
        document_path = save_path + save_name

        if file_extension in extensions:
            if created or not doc.is_downloaded:
                # First download of this document.
                doc.url_is_valid = True
                is_downloaded = False
                try:
                    is_downloaded = DownloadFile(
                        long_url, document_path).download()
                    doc.is_downloaded = is_downloaded
                except Exception:
                    # Best effort: a failed download must not prevent the
                    # document from being saved, but it should be visible.
                    logger.exception("Download of %s failed", long_url)

                if is_downloaded:
                    doc.long_url_hash = hashlib.md5(long_url).hexdigest()
                    doc.file_hash = hash_file(document_path)
                    document_content = fulltext.get(
                        document_path, '< no content >')
                    doc.document_content = document_content
            else:
                # The document was downloaded by an earlier call: fetch a
                # fresh copy under update.<timestamp>.<pk>.<ext> and compare.
                # This is an ``else`` so that a document downloaded just
                # above is not immediately downloaded a second time.
                ts = time.time()
                document_path_update = (
                    save_path + "update." + str(ts) + "." + save_name)
                is_downloaded = False
                try:
                    is_downloaded = DownloadFile(
                        long_url, document_path_update).download()
                except Exception:
                    logger.exception(
                        "Update download of %s failed", long_url)

                changed = False
                if is_downloaded:
                    long_url_hash = hashlib.md5(long_url).hexdigest()
                    file_hash = hash_file(document_path_update)
                    changed = (doc.long_url_hash != long_url_hash
                               or doc.file_hash != file_hash)

                if changed:
                    doc.document_or_long_url_changed = True
                    doc.long_url_hash = long_url_hash
                    doc.file_hash = file_hash
                    document_content = fulltext.get(
                        document_path_update, '< no content >')
                    doc.document_content = document_content
                elif os.path.exists(document_path_update):
                    # Nothing changed (or the download failed part-way):
                    # drop the copy. The existence check avoids an OSError
                    # when the failed download never created the file.
                    os.remove(document_path_update)

    try:
        doc.save()
    except Exception:
        # Retry once with the extracted text decoded as latin-1; presumably
        # the first save fails when the text is a byte string the storage
        # layer cannot handle. Only decode text extracted during this call,
        # so stored content is never overwritten with an empty string.
        if document_content and isinstance(document_content, bytes):
            doc.document_content = document_content.decode("latin-1")
        doc.save()
Exemple #39
0
 def test_txt(self):
     "Text extracted from the path files/test.txt must equal TEST."
     extracted = fulltext.get('files/test.txt')
     self.assertEqual(extracted, TEST)
Exemple #40
0
 def test_doc_file(self):
     "The 'doc' backend (antiword) wraps its output, so newlines are allowed."
     with open('files/test.doc', 'rb') as doc_file:
         extracted = fulltext.get(doc_file, backend='doc')
     # The extracted text is compared after the file has been closed.
     self.assertEqual(extracted, TEXT_WITH_NEWLINES)
Exemple #41
0
 def inner(self):
     # `path` and `fmt` are free variables supplied by the enclosing scope,
     # which is not visible in this snippet.
     extracted = fulltext.get(path, backend=fmt)
     self.assertEqual(extracted, TEXT)
Exemple #42
0
 def test_zip(self):
     "files/test.zip passed as an open binary file object must yield TEST."
     with open('files/test.zip', 'rb') as archive:
         extracted = fulltext.get(archive)
     self.assertEqual(extracted, TEST)
Exemple #43
0
 def test_txt(self):
     "files/test_enc.txt passed as an open file object must yield ENC_TEST."
     with open('files/test_enc.txt', 'r') as text_file:
         extracted = fulltext.get(text_file)
     self.assertEqual(extracted, ENC_TEST)
Exemple #44
0
 def test_doc(self):
     "files/test_enc.doc passed as an open binary file object must yield ENC_TEST."
     with open('files/test_enc.doc', 'rb') as doc_file:
         extracted = fulltext.get(doc_file)
     self.assertEqual(extracted, ENC_TEST)
Exemple #45
0
 def test_rtf(self):
     "Text extracted from the path files/test.rtf must equal TEST."
     extracted = fulltext.get('files/test.rtf')
     self.assertEqual(extracted, TEST)
Exemple #46
0
 def test_ods(self):
     "files/test.ods passed as an open file object must yield TEST."
     # Open in binary mode inside a context manager: the spreadsheet is not
     # text, and the handle is closed even when the assertion fails (the
     # previous `file(...)` call left it open).
     with open('files/test.ods', 'rb') as fo:
         self.assertEqual(fulltext.get(fo), TEST)
Exemple #47
0
 def test_xls(self):
     "Text extracted from the path files/test.xls must equal TEST."
     extracted = fulltext.get('files/test.xls')
     self.assertEqual(extracted, TEST)
Exemple #48
0
 def test_pdf(self):
     "files/test.pdf passed as an open file object must yield TEST."
     # Open in binary mode inside a context manager: a PDF is not text, and
     # the handle is closed even when the assertion fails (the previous
     # `file(...)` call left it open).
     with open('files/test.pdf', 'rb') as fo:
         self.assertEqual(fulltext.get(fo), TEST)
Exemple #49
0
 def test_zip(self):
     "Text extracted from the path files/test.zip must equal TEST."
     extracted = fulltext.get('files/test.zip')
     self.assertEqual(extracted, TEST)
Exemple #50
0
 def test_missing_default(self):
     "A path that does not exist must yield the supplied default, not raise."
     fallback = 'canary'
     result = fulltext.get('non-existent-file.pdf', fallback)
     self.assertEqual(result, fallback)
Exemple #51
0
def _handle_open(path):
    """Open *path* in binary mode and return ``fulltext.get`` of the file object."""
    with open(path, 'rb') as stream:
        extracted = fulltext.get(stream)
    return extracted
Exemple #52
0
def download_file(d):
    """Fetch the file behind a document link and store its extracted text.

    Looks up the ``DocumentLink`` with primary key ``d.pk`` and its
    ``Document`` (created if it does not exist yet).  When ``d.url``
    resolves to a file whose extension is one of doc/pdf/docx/xls:

    * a document that is new, or not yet marked as downloaded, is saved to
      ``<cwd>/docstore/<pk>.<ext>``; on success its URL hash, file hash and
      extracted text are recorded;
    * a document already marked as downloaded is fetched again to
      ``update.<timestamp>.<pk>.<ext>``; that copy is kept and its text
      re-extracted only if the URL hash or the file hash differs from the
      stored values, otherwise the copy is deleted.

    The document is saved in every case.

    :param d: object exposing ``pk`` and ``url``.
    """
    import logging

    logger = logging.getLogger(__name__)

    document_link = DocumentLink.objects.get(pk=d.pk)
    doc, created = Document.objects.get_or_create(document_link=document_link)
    extensions = ('doc', 'pdf', 'docx', 'xls')
    document_content = ''

    if d.url:
        # NOTE(review): '__file__' is a string literal here, so this resolves
        # to the current working directory rather than this module's
        # directory. Kept as-is because existing files live there -- confirm.
        wk_dir = os.path.dirname(os.path.realpath('__file__'))
        save_path = wk_dir + "/docstore/"

        # Follow redirects (e.g. URL shorteners) to find the final URL.
        r = requests.head(d.url, allow_redirects=True)
        long_url = r.url if d.url != r.url else d.url
        doc.long_url = long_url
        local_filename = long_url.split('/')[-1]
        doc.document_name = local_filename

        # Only URLs whose last path segment ends in a known extension are
        # downloaded.
        file_extension = local_filename.split('.')[-1].lower()
        save_name = str(d.pk) + '.' + file_extension
        document_path = save_path + save_name

        if file_extension in extensions:
            if created or not doc.is_downloaded:
                # First download of this document.
                doc.url_is_valid = True
                is_downloaded = False
                try:
                    is_downloaded = DownloadFile(
                        long_url, document_path).download()
                    doc.is_downloaded = is_downloaded
                except Exception:
                    # Best effort: a failed download must not prevent the
                    # document from being saved, but it should be visible.
                    logger.exception("Download of %s failed", long_url)

                if is_downloaded:
                    doc.long_url_hash = hashlib.md5(long_url).hexdigest()
                    doc.file_hash = hash_file(document_path)
                    document_content = fulltext.get(
                        document_path, '< no content >')
                    doc.document_content = document_content
            else:
                # The document was downloaded by an earlier call: fetch a
                # fresh copy under update.<timestamp>.<pk>.<ext> and compare.
                # This is an ``else`` so that a document downloaded just
                # above is not immediately downloaded a second time.
                ts = time.time()
                document_path_update = (
                    save_path + "update." + str(ts) + "." + save_name)
                is_downloaded = False
                try:
                    is_downloaded = DownloadFile(
                        long_url, document_path_update).download()
                except Exception:
                    logger.exception(
                        "Update download of %s failed", long_url)

                changed = False
                if is_downloaded:
                    long_url_hash = hashlib.md5(long_url).hexdigest()
                    file_hash = hash_file(document_path_update)
                    changed = (doc.long_url_hash != long_url_hash
                               or doc.file_hash != file_hash)

                if changed:
                    doc.document_or_long_url_changed = True
                    doc.long_url_hash = long_url_hash
                    doc.file_hash = file_hash
                    document_content = fulltext.get(
                        document_path_update, '< no content >')
                    doc.document_content = document_content
                elif os.path.exists(document_path_update):
                    # Nothing changed (or the download failed part-way):
                    # drop the copy. The existence check avoids an OSError
                    # when the failed download never created the file.
                    os.remove(document_path_update)

    try:
        doc.save()
    except Exception:
        # Retry once with the extracted text decoded as latin-1; presumably
        # the first save fails when the text is a byte string the storage
        # layer cannot handle. Only decode text extracted during this call,
        # so stored content is never overwritten with an empty string.
        if document_content and isinstance(document_content, bytes):
            doc.document_content = document_content.decode("latin-1")
        doc.save()