def test_handler(self):
    """Ensure a handler registered for a given type is executed when
    that type is converted."""
    def test_handler(f, type):
        return TEST

    fulltext.add('application/test', '.test', test_handler)
    # Conversion by path and by file object must both reach the handler.
    self.assertEqual(fulltext.get('files/test.test'), TEST)
    # `file()` does not exist on Python 3; use open() in a context
    # manager so the handle is also closed deterministically.
    with open('files/test.test', 'r') as fo:
        self.assertEqual(fulltext.get(fo), TEST)
def test_unknown_ext(self):
    """A file with an unrecognized extension falls back to the bin backend."""
    path = self.touch('testfn.unknown')
    with mock.patch('fulltext.handle_path', return_value="") as patched:
        fulltext.get(path)
    backend = patched.call_args[0][0]
    self.assertEqual(backend.__module__, 'fulltext.backends.__bin')
def handle_fobj(self, path_or_file):
    """Extract text from a gzip archive given as a path or file object.

    The backend for the decompressed content is chosen from the
    original file name's extension when it has a useful one, otherwise
    by sniffing the decompressed stream.
    """
    # Avoid circular imports.
    from fulltext import get, backend_from_fname, backend_from_fobj
    f, path = self.get_fobj_and_path(path_or_file)
    with f:
        orig_name = orig_fname(path)
        # Prefer the inner name's extension unless it is itself ".gz"
        # (e.g. "foo.txt.gz" -> text backend; "foo.gz" -> sniff content).
        if _has_ext(orig_name) and splitext(orig_name)[1].lower() != '.gz':
            backend = backend_from_fname(orig_name)
        else:
            backend = backend_from_fobj(f)
        try:
            return get(f, backend=backend)
        except Exception:
            # Some backends are not able to deal with gzip.GzipFile
            # instances so we copy the file on
            # disk. See: https://github.com/btimby/fulltext/issues/56
            LOGGER.info(
                "%r backend could not handle gzip file object directly; "
                "retrying by extracting the gzip on disk" % backend)
            # Re-open the source; the first file object was consumed above.
            f2, _ = self.get_fobj_and_path(path_or_file)
            ext = splitext(orig_name)[1]
            with f2:
                with fobj_to_tempfile(f2, suffix=ext) as fname:
                    return get(fname, backend=backend)
def test_command(self):
    """Commands registered for a given type are executed by the
    `run_command` handler when that type is converted."""
    commands = (('echo', TEST), ('echo', TEST))
    fulltext.add('application/test', '.test', fulltext.run_command, commands)
    self.assertEqual(fulltext.get('files/test.test'), TEST)
    with open('files/test.test', 'r') as handle:
        self.assertEqual(fulltext.get(handle), TEST)
def test_src_code_ext(self):
    """A source-code extension (.js) is routed to the text backend."""
    path = "file.js"
    self.touch(path, content="foo bar")
    with mock.patch('fulltext.handle_path', return_value="") as patched:
        fulltext.get(path)
    backend = patched.call_args[0][0]
    self.assertEqual(backend.__module__, 'fulltext.backends.__text')
def test_text_ext(self):
    """Every plain-source extension is routed to the text backend."""
    for extension in (".py", ".cpp", ".h", ".pl"):
        path = self.touch("document%s" % extension)
        with mock.patch('fulltext.handle_path', return_value="") as patched:
            fulltext.get(path)
        backend = patched.call_args[0][0]
        self.assertEqual(backend.__module__, 'fulltext.backends.__text')
def test_by_name_with_no_ext(self):
    """With no extension on the hint name, the bin backend is picked up."""
    path = self.touch("woodstock-no-ext")
    with mock.patch('fulltext.handle_path', return_value="") as patched:
        fulltext.get(path, name=path)
    backend = patched.call_args[0][0]
    self.assertEqual(backend.__module__, 'fulltext.backends.__bin')
def test_by_backend(self):
    """An explicit backend= option overrides the file extension."""
    path = self.touch('testfn.doc')
    with mock.patch('fulltext.handle_path', return_value="") as patched:
        fulltext.get(path, backend='html')
    backend = patched.call_args[0][0]
    self.assertEqual(backend.__module__, 'fulltext.backends.__html')
def test_params(self):
    """The Backend instance receives the encoding parameters passed in."""
    path = self.touch('testfn.doc')
    with mock.patch('fulltext.handle_path', return_value="") as patched:
        fulltext.get(path, encoding='foo', encoding_errors='bar')
    backend = patched.call_args[0][0]
    self.assertEqual(backend.encoding, 'foo')
    self.assertEqual(backend.encoding_errors, 'bar')
def test_name_attr(self):
    """fulltext determines the file name from the "name" attr of the
    file object (here: a temp file with an .html suffix)."""
    # Use a context manager so the temporary file is always closed and
    # removed; the original leaked the NamedTemporaryFile handle.
    with tempfile.NamedTemporaryFile(suffix='.html') as f:
        with mock.patch('fulltext.handle_fobj', return_value="") as m:
            fulltext.get(f)
        klass = m.call_args[0][0]
    self.assertEqual(klass.__module__, 'fulltext.backends.__html')
def test_html(self):
    """HTML content in an extension-less file is sniffed to the html backend."""
    fname = "file-noext"
    # Close the fixture handle deterministically; the original called
    # open(...).read() and leaked the file object.
    with open(pathjoin(HERE, 'files/test.html'), 'rb') as src:
        self.touch(fname, content=src.read())
    with mock.patch('fulltext.handle_path', return_value="") as m:
        fulltext.get(fname)
    klass = m.call_args[0][0]
    self.assertEqual(klass.__module__, 'fulltext.backends.__html')
def test_command(self):
    """Commands registered for a given type are executed by the
    `run_command` handler when that type is converted."""
    fulltext.add('application/test', '.test', fulltext.run_command, (
        ('echo', TEST),
        ('echo', TEST),
    ))
    self.assertEqual(fulltext.get('files/test.test'), TEST)
    # `file()` does not exist on Python 3; use open() in a context
    # manager so the handle is also closed deterministically.
    with open('files/test.test', 'r') as fo:
        self.assertEqual(fulltext.get(fo), TEST)
def test_register_backend_ext(self):
    """A backend registered for a made-up extension is used for it."""
    fulltext.register_backend(
        'application/ijustmadethisup',
        'fulltext.backends.__html',
        extensions=['.ijustmadethisup'])
    path = self.touch("document.ijustmadethisup")
    with mock.patch('fulltext.handle_path', return_value="") as patched:
        fulltext.get(path)
    backend = patched.call_args[0][0]
    self.assertEqual(backend.__module__, 'fulltext.backends.__html')
def brute_txt(fn):
    """Convert any supported document at *fn* to plain text.

    Dispatches on the file extension; unknown extensions fall back to
    the `fulltext` library. Returns '' for missing files or empty output.
    """
    if not os.path.exists(fn):
        print('! No filename found')
        return ''
    ext = os.path.splitext(fn)[-1][1:]
    txt = ''
    if ext == 'epub':
        txt = epub2txt(fn)
    elif ext in {'xml', 'html', 'htm'}:
        with open(fn) as f:
            txt = xml2txt(f.read(), CONTENT_TAGS[ext])
    elif ext == 'txt':
        # Plain text short-circuits: the decoded bytes are returned
        # without the cleanup applied below.
        with open(fn, 'rb') as f:
            raw = f.read()
        return to_unicode(raw)
    elif ext == 'pdf':
        txt = pdf2txt(fn)
    else:
        import fulltext
        txt = fulltext.get(fn)
    if not txt:
        return ''
    # Normalize non-breaking spaces, then strip Gutenberg boilerplate.
    txt = txt.replace('\xa0', ' ')
    if 'project gutenberg ebook' in txt.lower():
        txt = clean_gutenberg(txt)
    return txt
def extract_document_text(self, filename, encoding="iso-8859-13", language="est"):
    """Extract text from *filename*, OCRing PDFs that yield no text.

    Note: Python 2 code — uses the `unicode` builtin.
    """
    name, extension = os.path.splitext(filename)
    known = {".doc", ".docx", ".rtf", ".pdf", ".odt"}
    # Unknown extensions are forced through the msword handler.
    type = ("application/msword", None) if extension not in known else None
    text = unicode(fulltext.get(filename, type=type), encoding=encoding)
    if extension == ".pdf" and len(text) == 0:
        # An empty PDF extraction usually means a scanned document.
        process = subprocess.Popen(
            ("pypdfocr", "-l", language, filename), close_fds=True)
        process.communicate()
        ocr_filename = "{}_ocr{}".format(name, extension)
        if os.path.isfile(ocr_filename):
            # Replace the original with the OCRed copy and re-extract.
            os.rename(ocr_filename, filename)
            text = unicode(fulltext.get(filename, type=type), encoding=encoding)
        else:
            print ("failed to ocr: {}".format(filename))
    return text
def load_test(fi) -> tuple:
    """Load a test document and split it into (questions, reading passage)."""
    instructions = ('Choose the best word or phrase (a, b, c or d) '
                    'to fill each blank.')
    test = fulltext.get(fi).replace(instructions, '')
    lowered = test.lower()
    # Drop the trailing answer sheet, if present.
    if "answer sheet" in lowered:
        test = test[:lowered.find("answer sheet")]
    # The reading passage runs from its instruction line to question (21).
    reading = test[test.lower().find("read the text below"):test.find("(21)")]
    return test.replace(reading, ""), reading
def post(self, request, *args, **kwargs):
    """Parse an uploaded answer-key file and rebuild TestQuestion rows.

    Requires the module-level `questions` list to already be populated
    (by a prior questions upload); responds 400 otherwise. On success,
    redirects to the test editor page.
    """
    global questions
    if questions == []:
        return Response(
            {"status": "Keys can not be pasted before questions."},
            status=status.HTTP_400_BAD_REQUEST)
    file_serializer = FileSerializer(data=request.data)
    if file_serializer.is_valid():
        fi = file_serializer.save()
        fi.file.open(mode='rb')
        with fi.file:
            answers = fulltext.get(fi.file)
        # Key lines look like "(12) c": question number -> answer letter.
        for line in answers.split('\n'):
            if re.match(r'\(\d+\) [a-d]', line):
                questions[
                    int(re.findall(r'\d+', line)[0]) - 1
                ].answ_correct = re.findall(r'[a-d]', line)[0]
        # Replace all persisted questions with the updated in-memory set.
        TestQuestion.objects.all().delete()
        for question in questions:
            TestQuestion.objects.create(
                number=question.number,
                text=question.text,
                answ_correct=question.answ_corr(),
                answ_option1=question.answers[0],
                answ_option2=question.answers[1],
                answ_option3=question.answers[2],
                answ_option4=question.answers[3],
                is_reading=question.is_reading,
            )
        return HttpResponseRedirect(base_path.BASE_PATH + 'test_editor/')
    else:
        return Response(file_serializer.errors,
                        status=status.HTTP_400_BAD_REQUEST)
def test_global_vars(self):
    """Module-level ENCODING/ENCODING_ERRORS are forwarded to backends."""
    saved = (fulltext.ENCODING, fulltext.ENCODING_ERRORS)
    path = self.touch("file.txt", content=b"hello")
    try:
        fulltext.ENCODING = "foo"
        fulltext.ENCODING_ERRORS = "bar"
        with mock.patch('fulltext.handle_path', return_value="") as patched:
            fulltext.get(path)
        backend = patched.call_args[0][0]
        self.assertEqual(backend.encoding, 'foo')
        self.assertEqual(backend.encoding_errors, 'bar')
    finally:
        # Always restore the globals so other tests are unaffected.
        fulltext.ENCODING, fulltext.ENCODING_ERRORS = saved
def _handle_text(self, f):
    """Main body shared by both 'text mode' tests; always closes *f*."""
    try:
        self.assertMultiLineEqual(self.text, fulltext.get(f, mime=self.mime))
    finally:
        f.close()
def upload_file(): if request.method == 'POST': file = request.files['file'] if file and allowed_file(file.filename): filename = secure_filename(file.filename) file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) str_data = [] t = database_files() for k in t: text_from_file = "" try: text_from_file = fulltext.get( 'uploads/' + str(k), None).replace('\n', ' ').replace('\"', "").replace("\'", "") except: continue cosine_freq = cosine.cosinedatabaseTF(text_from_file) shingles = [] for i in range(1, 4): shingle = "{" + shinglmethods.genshingle_n( text_from_file, i) + "}" shingles.append(shingle) shingle_t = "\"shingles_by_id\": [" + ",".join(shingles) + "]" shingles_sorted = [] for i in range(1, 4): shingle_sorted = "{" + shinglmethods.genshingle_n( text_from_file, i) + "}" shingles_sorted.append(shingle_sorted) shingle_t_sorted = "\"shingles_sorted_by_id\": [" + ",".join( shingles) + "]" moodles = [] for i in range(1, 4): moodle = "{" + moodlemethod.genmoodle_n(text_from_file, i) + "}" moodles.append(moodle) moodle_t = "\"moodles_by_id\": [" + ",".join(moodles) + "]" filepath = "txt/" + str(k).replace("doc", "").replace( "docx", "").replace("txt", "") + "txt" hs = open(filepath, "w") str_data.append( "{" + "\"name\":\"{}\", \"filepath\":\"{}\", {}, {}, {}, {}". format(str(k), filepath, cosine_freq, shingle_t, shingle_t_sorted, moodle_t) + "}") hs.write(text_from_file) hs.close() text_for_json = ",".join(str_data) hj = open("database.json", "w") hj.write("[" + text_for_json + "]") hj.close() return redirect(url_for('uploaded_file', filename=filename)) return '''
def test_text_strip(self):
    """Stripping normalizes spaces and punctuation in bin-backend output."""
    payload = (b' Test leading and trailing spaces removal. '
               b'Test @$%* punctuation removal! '
               b'Test spaces  removal! ')
    stripped = fulltext.get(BytesIO(payload), backend='bin')
    expected = ('Test leading and trailing spaces removal. '
                'Test punctuation removal! Test spaces '
                'removal!')
    self.assertMultiLineEqual(expected, stripped)
def upload_file_for_check(): if request.method == 'POST': file = request.files['file'] if file and allowed_file(file.filename): filename = secure_filename(file.filename) file.save(os.path.join('/tmp', filename)) text = fulltext.get('/tmp/' + filename) #return text return render_template('index.html', query=text) return '''
def test_callbacks(self):
    """Backend setup/handle_fobj/teardown callbacks fire, in that order."""
    calls = []

    class Backend:
        def setup(self):
            calls.append("setup")

        def teardown(self):
            calls.append("teardown")

        def handle_fobj(self, path):
            calls.append("handle_fobj")
            return "text"

    path = self.touch('testfn.doc')
    with mock.patch('fulltext.backend_inst_from_mod',
                    return_value=Backend()):
        fulltext.get(path, encoding='foo', encoding_errors='bar')
    self.assertEqual(calls, ['setup', 'handle_fobj', 'teardown'])
def test_teardown_on_err(self):
    """teardown() is invoked even when the handler raises."""
    calls = []

    class Backend:
        def setup(self):
            calls.append("setup")

        def teardown(self):
            calls.append("teardown")

        def handle_fobj(self, path):
            1 / 0

    path = self.touch('testfn.doc')
    with mock.patch('fulltext.backend_inst_from_mod',
                    return_value=Backend()):
        with self.assertRaises(ZeroDivisionError):
            fulltext.get(path, encoding='foo', encoding_errors='bar')
    self.assertEqual(calls, ['setup', 'teardown'])
def handle_fobj(self, f):
    """Concatenate the extracted text of every member of a RAR archive."""
    from fulltext import get  # avoid circular import
    with ExitStack() as stack:
        buf = StringIO()
        archive = stack.enter_context(rarfile.RarFile(f))
        # Note: the original shadowed the parameter `f` with the loop
        # variable; renamed to `info` (behavior unchanged).
        for info in archive.infolist():
            LOGGER.debug("extracting %s" % info.filename)
            member = stack.enter_context(archive.open(info))
            buf.write(get(member, name=info.filename,
                          encoding=self.encoding,
                          encoding_errors=self.encoding_errors))
        return buf.getvalue()
def handle_fobj(self, f): from fulltext import get # avoid circular import with ExitStack() as stack: text = StringIO() z = stack.enter_context(zipfile.ZipFile(f, 'r')) for name in sorted(z.namelist()): LOGGER.debug("extracting %s" % name) zf = stack.enter_context(z.open(name, 'r')) # Kinda hacky, but zipfile's open() does not handle "b" in # the mode. # We do this here to satisy an assertion in handle_fobj(). zf.mode += 'b' text.write(get(zf, name=name)) return text.getvalue()
def scanDoc(self, path):
    """Extract keyword text from a document, OCRing when extraction is empty.

    Returns the document's tokens minus Dutch stopwords and punctuation,
    joined with single spaces.
    """
    text = self.remove_non_ascii(fulltext.get(path))
    if text == "":
        # Oke so maybe the pdf was actually an image
        print(
            'Maybe pdf contained only images, trying to get text from image'
        )
        text = self.get_image_content(path)
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '.', "'", '@', '&']
    stop_words = stopwords.words('dutch')
    keywords = []
    for token in word_tokenize(text):
        if token not in stop_words and token not in punctuations:
            keywords.append(token)
    return " ".join(keywords)
def test_invalid_char(self):
    """Invalid byte sequences raise unless encoding_errors='ignore'."""
    invalid_path = pathjoin(HERE, "files/unicode/invalid.%s" % self.ext)
    if os.path.exists(invalid_path):
        with self.assertRaises(UnicodeDecodeError):
            fulltext.get(invalid_path)
        self.assertEqual(
            fulltext.get(invalid_path, encoding_errors="ignore"),
            self.invalid)
    #
    italian_path = pathjoin(HERE, "files/unicode/it.%s" % self.ext)
    with self.assertRaises(UnicodeDecodeError):
        fulltext.get(italian_path, encoding='ascii')
    ret = fulltext.get(
        italian_path, encoding='ascii', encoding_errors="ignore")
    expected = self.italian.replace(
        u"àèìòù", u"").replace(u" ", u" ").strip()
    self.assertEqual(ret, expected)
def test_doc(self):
    """A .doc fixture converts to the canonical TEST text."""
    extracted = fulltext.get('files/test.doc')
    self.assertEqual(extracted, TEST)
def _handle_open(path):
    """Return extracted text for *path*, reading via a binary file object."""
    with open(path, 'rb') as handle:
        return fulltext.get(handle)
def test_default_none(self):
    """None is accepted as the default value for unknown file types."""
    result = fulltext.get('unknown-file.foobar', None)
    self.assertEqual(result, None)
def test_unknown_default(self):
    """An unknown file type returns the default instead of raising."""
    result = fulltext.get('unknown-file.foobar', 'canary')
    self.assertEqual(result, 'canary')
def main():
    """Geocode the user's address, download planning documents from the
    Kuusalu municipality register, extract cadastral numbers from them,
    and print each number's distance from the user's address.

    Python 2 script (print statements, raw_input). Comments translated
    to English; user-facing strings left in Estonian.
    """
    ###########################################################################
    # my address
    ###########################################################################
    proj = Proj(init="epsg:3301")  # L-EST97 projection
    # VAIKEKOHT,TANAV,KATASTRIYKSUS,EHITISHOONE
    querystring = {
        "dogis_link": "getgazetteer",
        "features": "EHITISHOONE",
        "results": "5"
    }
    address = raw_input("Sisesta otsitav aadress (või vajuta ENTER): ")
    if not address:
        # 25°24'14.586"E 59°29'9.822"N
        # 579555,6595094
        address = "Lõuna tee 15, Mäepea küla"
    # Estonian Land Board service for address info (incl. coordinates):
    # http://geoportaal.maaamet.ee/est/Teenused/X-GIS-JSON-aadressiotsingu-teenuse-kirjeldus-p502.html
    querystring["address"] = address
    response = requests.request('GET', 'http://xgis.maaamet.ee/xGIS/XGis',
                                params=querystring)
    addresses = defaultdict(list)
    try:
        json = response.json()["featureMember"]
        if isinstance(json, dict):
            get_address(json, addresses, proj)
        elif isinstance(json, list):
            for member in json:
                get_address(member, addresses, proj)
    except (ValueError, KeyError):
        print "ei leitud", response.status_code
    response.close()
    tunnus, address, lest97, geo = addresses["EHITISHOONE"][0]
    print "======================================================================="
    print "Leitud aadress:"
    print address
    print "Koordinaadid:"
    pprint(geo)  # geographic coordinates
    pprint(lest97)  # projected coordinates
    print "======================================================================="
    raw_input("(vajuta ENTER):")
    ###########################################################################
    # amphora topic IDs:
    #   Planning and construction - 5059
    #   Initiation of detailed plans - 50285
    #   Adoption of detailed plans - 50286
    #   Acceptance of detailed plans - 50287
    #   Setting of design conditions - 50288
    #   Land management - 50344
    ###########################################################################
    # Kuusalu municipality document register
    url = "http://server.amphora.ee/atp/kuusaluvv/AmphoraPublic.asmx"
    headers = {
        "content-type": "application/x-www-form-urlencoded"
    }
    ###########################################################################
    payload = {
        "type": "DOCUMENT",
        "topicID": "5059",
        "maxRows": "20",
        "unitID": "",
        "folderID": "",
        "formID": "",
        "phrase": "",
        "startRowIndex": "",
        "detailMetadata": ""
    }
    # Request the list of documents from the register.
    articles = dict()
    response = requests.post(url + "/GetItemList", data=payload,
                             headers=headers, stream=True)
    response.raw.decode_content = True
    # 259449 and 259430 are large documents; excluded for demo purposes.
    for event, element in et.iterparse(response.raw):
        if get_element_tag(element.tag) == "sys_id" \
                and element.text != "259449" and element.text != "259430":
            articles[element.text] = dict()
        element.clear()
    response.close()
    ###########################################################################
    # documents
    ###########################################################################
    articles_folder = os.path.dirname(os.path.abspath(__file__)) + "/documents/"
    if os.path.exists(articles_folder):
        shutil.rmtree(articles_folder)
    os.makedirs(articles_folder)
    # Per-document metadata and file request.
    progress = tqdm(articles)
    for key in progress:
        progress.set_description("Dokumendid %s" % key)
        payload = {
            "id": key,
            "maxDepth": "0"
        }
        # Single document request (streamed XML).
        response = requests.request("POST", url + "/GetItem", data=payload,
                                    headers=headers, stream=True)
        response.raw.decode_content = True
        path = deque()
        content = None
        filename = None
        filetype = None
        for event, element in et.iterparse(response.raw,
                                           events=("start", "end")):
            element_tag = get_element_tag(element.tag)
            if event == "start":
                path.append(element_tag)
            elif event == "end":
                if "file" in path:
                    if element_tag == "data":
                        content = base64.decodestring(element.text or "")
                    elif element_tag == "filename":
                        filename = element.text
                    elif element_tag == "type":
                        filetype = element.text
                if element_tag == "field" and "name" in element.attrib \
                        and element.attrib["name"] == "Caption":
                    articles[key]["title"] = element.text
                path.pop()
                element.clear()
                if content is not None and filename is not None \
                        and filetype == "MAIN_FILE":
                    _, extension = os.path.splitext(filename)
                    articles[key]["file"] = key + extension
                    out = open(articles_folder + key + extension, "wb")
                    # Save the attached file to disk.
                    out.write(content)
                    out.close()
                    break
        else:
            # for/else: no MAIN_FILE attachment found for this document.
            print "ei sisalda faili", key
        response.close()
    print "======================================================================="
    ###########################################################################
    # files
    ###########################################################################
    # Land Board service for querying coordinates by cadastral number:
    # http://geoportaal.maaamet.ee/est/Teenused/Poordumine-kaardirakendusse-labi-URLi-p9.html#a13
    url = "http://geoportaal.maaamet.ee/url/xgis-ky.php"
    querystring = {
        "what": "tsentroid",
        "out": "json"
    }
    # Distance computed in the projected coordinate system.
    point = lambda coordinate: float(coordinate)
    distance = lambda src, dest: math.sqrt(
        (point(src[0]) - point(dest[0])) ** 2 +
        (point(src[1]) - point(dest[1])) ** 2)
    # Regexp for finding cadastral numbers in document text.
    pattern = re.compile("\d{5}:\d{3}:\d{4}")
    _, _, x_y, _ = addresses["EHITISHOONE"][0]  # my address
    progress = tqdm(articles)
    for key in progress:
        progress.set_description("Koordinaadid %s" % key)
        articles[key]["katastrinumbrid"] = list()
        if "file" not in articles[key]:
            continue
        # Convert pdf/doc/rtf files to text.
        text = fulltext.get(articles_folder + articles[key]["file"])
        katastrinumbrid = set(pattern.findall(text))
        for number in katastrinumbrid:
            querystring["ky"] = number
            # Coordinate query against the Land Board.
            response = requests.request("GET", url, params=querystring)
            try:
                json = response.json()["1"]
                # Distance of this cadastral unit from my address, in km.
                distance_km = distance(x_y, (json["X"], json["Y"])) / 1000
                # Geographic coordinates (can be pasted into Google Maps).
                longitude, latitude = proj(json["X"], json["Y"], inverse=True)
                articles[key]["katastrinumbrid"].append(
                    (number, (json["X"], json["Y"]), distance_km,
                     (latitude, longitude)))
            except (ValueError, KeyError):
                print "koordinaate ei leitud", response.status_code, key, number
            response.close()
    print "======================================================================="
    for key in articles:
        if articles[key]["katastrinumbrid"]:
            print "---------------"
            print "Dokument:", key, articles[key]["title"]
            print "---------------"
            for number in articles[key]["katastrinumbrid"]:
                knumber, lest97, kaugus, (latitude, longitude) = number
                print knumber, " koordinaadid:", latitude, ",", longitude, " kaugus minu aadressist:", kaugus
            print "======================================================================="
def download_file(d):
    """Download the document behind link *d* and store its extracted text.

    Resolves redirects to the final URL, downloads the file when its
    extension is whitelisted, extracts text with fulltext, and records
    URL/file hashes so later runs can detect a changed document.
    """
    document_link = DocumentLink.objects.get(pk=d.pk)
    doc, created = Document.objects.get_or_create(document_link=document_link)
    extensions = (
        'doc',
        'pdf',
        'docx',
        'xls',
    )
    document_content = ''
    if d.url:
        '''Define the working Directory and saving Path'''
        wk_dir = os.path.dirname(os.path.realpath('__file__'))
        save_path = wk_dir + "/docstore/"
        '''Unshort URLs and get file name'''
        r = requests.head(d.url, allow_redirects=True)
        if d.url != r.url:
            long_url = r.url
        else:
            long_url = d.url
        doc.long_url = long_url
        local_filename = long_url.split('/')[-1]
        doc.document_name = local_filename
        '''Verify if the the URL is containing a file and authorize download'''
        file_extension = local_filename.split('.')[-1].lower()
        save_name = str(d.pk) + '.' + file_extension
        document_path = save_path + save_name
        is_downloaded = False
        if file_extension in extensions:
            # First-time (or never-successful) download path.
            if created or (not created and not doc.is_downloaded):
                doc.url_is_valid = True
                downloader = DownloadFile(long_url, document_path)
                try:
                    is_downloaded = downloader.download()
                    doc.is_downloaded = is_downloaded
                except Exception as e:
                    # print str(e)
                    pass
                '''Get Text from file and save document'''
                if is_downloaded:
                    doc.long_url_hash = hashlib.md5(long_url).hexdigest()
                    doc.file_hash = hash_file(document_path)
                    document_content = fulltext.get(
                        save_path + save_name, '< no content >')
                    doc.document_content = document_content
            # Re-download path: check whether the remote document changed.
            if (not created and doc.is_downloaded):
                '''prepare the updated file storage with the new name \
                <update.timestamp.id.extention'''
                ts = time.time()
                document_path_update = save_path + "update." + str(ts) + "." + save_name  # NOQA: E501
                downloader = DownloadFile(long_url, document_path_update)
                try:
                    is_downloaded = downloader.download()
                except Exception as e:
                    # print str(e)
                    pass
                '''hash the downloaded file and it long url'''
                if is_downloaded:
                    long_url_hash = hashlib.md5(long_url).hexdigest()
                    file_hash = hash_file(document_path_update)
                '''if file hash or url hash id different, parse the content '
                of the file'''
                if is_downloaded and long_url_hash != '' and (
                        doc.long_url_hash != long_url_hash or
                        doc.file_hash != file_hash):
                    doc.document_or_long_url_changed = True
                    doc.long_url_hash = long_url_hash
                    doc.file_hash = file_hash
                    document_content = fulltext.get(
                        document_path_update, '< no content >')
                    doc.document_content = document_content
                else:
                    '''delete the updated file. This file is empty'''
                    os.remove(document_path_update)
        try:
            doc.save()
        except Exception as e:
            # print str(e)
            # Fall back to latin-1 decoding when saving raw bytes fails.
            doc.document_content = document_content.decode("latin-1")
            doc.save()
def test_txt(self):
    """A .txt fixture converts to the canonical TEST text."""
    extracted = fulltext.get('files/test.txt')
    self.assertEqual(extracted, TEST)
def test_doc_file(self):
    """Antiword performs wrapping, so we need to allow newlines."""
    with open('files/test.doc', 'rb') as handle:
        extracted = fulltext.get(handle, backend='doc')
    self.assertEqual(extracted, TEXT_WITH_NEWLINES)
def inner(self):
    """Parametrized check: *path* converts to TEXT via backend *fmt*."""
    self.assertEqual(fulltext.get(path, backend=fmt), TEXT)
def test_zip(self):
    """A zip archive given as a binary file object converts to TEST."""
    with open('files/test.zip', 'rb') as handle:
        extracted = fulltext.get(handle)
    self.assertEqual(extracted, TEST)
def test_txt(self):
    """An encoded .txt fixture (text-mode handle) converts to ENC_TEST."""
    with open('files/test_enc.txt', 'r') as handle:
        extracted = fulltext.get(handle)
    self.assertEqual(extracted, ENC_TEST)
def test_doc(self):
    """An encoded .doc fixture (binary handle) converts to ENC_TEST."""
    with open('files/test_enc.doc', 'rb') as handle:
        extracted = fulltext.get(handle)
    self.assertEqual(extracted, ENC_TEST)
def test_rtf(self):
    """An .rtf fixture converts to the canonical TEST text."""
    extracted = fulltext.get('files/test.rtf')
    self.assertEqual(extracted, TEST)
def test_ods(self):
    """An .ods fixture given as a file object converts to TEST."""
    # `file()` does not exist on Python 3, and .ods is a zip container
    # (binary): open with a context manager in 'rb' mode, matching the
    # other binary-fixture tests in this file.
    with open('files/test.ods', 'rb') as fo:
        self.assertEqual(fulltext.get(fo), TEST)
def test_xls(self):
    """An .xls fixture converts to the canonical TEST text."""
    extracted = fulltext.get('files/test.xls')
    self.assertEqual(extracted, TEST)
def test_pdf(self):
    """A PDF fixture given as a file object converts to TEST."""
    # `file()` does not exist on Python 3, and a PDF is binary data:
    # open with a context manager in 'rb' mode, matching the other
    # binary-fixture tests in this file.
    with open('files/test.pdf', 'rb') as fo:
        self.assertEqual(fulltext.get(fo), TEST)
def test_zip(self):
    """A zip archive given by path converts to the canonical TEST text."""
    extracted = fulltext.get('files/test.zip')
    self.assertEqual(extracted, TEST)
def test_missing_default(self):
    """A missing file returns the default value instead of raising."""
    result = fulltext.get('non-existent-file.pdf', 'canary')
    self.assertEqual(result, 'canary')
def download_file(d):
    """Download the document behind link *d* and store its extracted text.

    Resolves redirects to the final URL, downloads the file when its
    extension is whitelisted, extracts text with fulltext, and records
    URL/file hashes so later runs can detect a changed document.
    """
    document_link = DocumentLink.objects.get(pk=d.pk)
    doc, created = Document.objects.get_or_create(document_link=document_link)
    extensions = (
        'doc',
        'pdf',
        'docx',
        'xls',
    )
    document_content = ''
    if d.url:
        '''Define the working Directory and saving Path'''
        wk_dir = os.path.dirname(os.path.realpath('__file__'))
        save_path = wk_dir + "/docstore/"
        '''Unshort URLs and get file name'''
        r = requests.head(d.url, allow_redirects=True)
        if d.url != r.url:
            long_url = r.url
        else:
            long_url = d.url
        doc.long_url = long_url
        local_filename = long_url.split('/')[-1]
        doc.document_name = local_filename
        '''Verify if the the URL is containing a file and authorize download'''
        file_extension = local_filename.split('.')[-1].lower()
        save_name = str(d.pk) + '.' + file_extension
        document_path = save_path + save_name
        is_downloaded = False
        if file_extension in extensions:
            # First-time (or never-successful) download path.
            if created or (not created and not doc.is_downloaded):
                doc.url_is_valid = True
                downloader = DownloadFile(long_url, document_path)
                try:
                    is_downloaded = downloader.download()
                    doc.is_downloaded = is_downloaded
                except Exception as e:
                    # print str(e)
                    pass
                '''Get Text from file and save document'''
                if is_downloaded:
                    doc.long_url_hash = hashlib.md5(long_url).hexdigest()
                    doc.file_hash = hash_file(document_path)
                    document_content = fulltext.get(
                        save_path + save_name, '< no content >')
                    doc.document_content = document_content
            # Re-download path: check whether the remote document changed.
            if (not created and doc.is_downloaded):
                '''prepare the updated file storage with the new name \
                <update.timestamp.id.extention'''
                ts = time.time()
                document_path_update = save_path + \
                    "update." + str(ts) + "." + save_name
                downloader = DownloadFile(long_url, document_path_update)
                try:
                    is_downloaded = downloader.download()
                except Exception as e:
                    # print str(e)
                    pass
                '''hash the downloaded file and it long url'''
                if is_downloaded:
                    long_url_hash = hashlib.md5(long_url).hexdigest()
                    file_hash = hash_file(document_path_update)
                '''if file hash or url hash id different, parse the content '
                of the file'''
                if is_downloaded and long_url_hash != '' and (
                        doc.long_url_hash != long_url_hash or
                        doc.file_hash != file_hash):
                    doc.document_or_long_url_changed = True
                    doc.long_url_hash = long_url_hash
                    doc.file_hash = file_hash
                    document_content = fulltext.get(
                        document_path_update, '< no content >')
                    doc.document_content = document_content
                else:
                    '''delete the updated file. This file is empty'''
                    os.remove(document_path_update)
        try:
            doc.save()
        except Exception as e:
            # print str(e)
            # Fall back to latin-1 decoding when saving raw bytes fails.
            doc.document_content = document_content.decode("latin-1")
            doc.save()