def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]
                # Parse both files through Tika and compare their extracted text.
                with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
                    file1_parsedData = parser.from_buffer(f1.read())
                    file2_parsedData = parser.from_buffer(f2.read())
                v1 = Vector(file1, file1_parsedData["content"])
                v2 = Vector(file2, file2_parsedData["content"])
                row_cosine_distance.append(v1.cosTheta(v2))
                a.writerow(row_cosine_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
def get_content(r, headers, requestOptions=None):
    """Format downloaded pdf content."""
    f = BytesIO(r.content)
    if requestOptions is None:
        raw = parser.from_buffer(f, headers=headers)
    else:
        raw = parser.from_buffer(f, headers=headers, requestOptions=requestOptions)
    f.close()
    return raw
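# Hedged usage sketch (not part of the original source): one way get_content()
# above might be called. Assumes the `requests` package, a running Tika server,
# and a placeholder URL; the OCR header value is only an example.
if __name__ == "__main__":
    import requests

    sample = requests.get("https://example.com/sample.pdf", timeout=30)
    raw = get_content(sample, headers={"X-Tika-PDFOcrStrategy": "no_ocr"})
    print((raw.get("content") or "")[:200])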
def esIndex(ccaDir, team, crawler, index, docType, url=None, outPath=None, storeprefix=None):
    if not url and not outPath:
        raise Exception("Either Elastic Url or output path must be specified.")
    ccaJsonList = list_files(ccaDir)
    print "Processing ["+str(len(ccaJsonList))+"] files."

    procCount = 0
    failedList = []
    failedReasons = []
    CDRVersion = 2.0
    outFile = codecs.open(outPath, 'w', 'utf-8') if outPath else None

    for f in ccaJsonList:
        with open(f, 'r') as fd:
            try:
                newDoc = {}
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                ccaDoc = json.loads(cbor.loads(c).value, encoding='utf8')
                newDoc["url"] = ccaDoc["url"]
                newDoc["timestamp"] = ccaDoc["imported"]
                newDoc["team"] = team
                newDoc["crawler"] = crawler
                newDoc["raw_content"] = ccaDoc["response"]["body"]
                newDoc["content_type"] = getContentType(ccaDoc)
                parsed = parser.from_buffer(newDoc["raw_content"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                if "content" in parsed:
                    newDoc["crawl_data"]["content"] = parsed["content"]
                    newDoc["extracted_text"] = parsed["content"]
                # CDR version 2.0 additions
                newDoc["_id"] = ccaDoc["key"]
                newDoc["obj_original_url"] = ccaDoc["url"]
                # newDoc["obj_parent"] = ???  Missing - TODO: get this field somehow!
                newDoc["obj_stored_url"] = url_to_nutch_dump_path(ccaDoc["url"], prefix=storeprefix)
                newDoc["extracted_metadata"] = parsed["metadata"] if 'metadata' in parsed else {}
                newDoc["version"] = CDRVersion
                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                if url:
                    indexDoc(url, newDoc, index, docType)
                if outFile:
                    outFile.write(json.dumps(newDoc))
                    outFile.write("\n")
                procCount += 1
            except Exception as err:
                failedList.append(f)
                failedReasons.append(str(err))
                traceback.print_exc()

    if outFile:
        print("Output Stored at %s" % outPath)
        outFile.close()
    print "Processed " + str(procCount) + " CBOR files successfully."
    print "Failed files: " + str(len(failedList))
    if _verbose:
        for i in range(len(failedList)):
            verboseLog("File: "+failedList[i]+" failed because "+failedReasons[i])
def textExtractor(URL):
    # Default timeout 10 seconds
    ok = False
    text = ""
    try:
        text = ur.urlopen(URL).read()
    except Exception as e:
        print("URL open Failed:", e)
    if len(text) == 0:
        try:
            r = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'})
            if r.status_code == 200:
                text = r.text
            else:
                return ok, ""
        except Exception as e:
            print("requests open Failed:", e)

    visible_text = ""
    if USE_TIKA:
        from tika import parser
        visible_text = parser.from_buffer(text)['content']
    else:
        soup = BeautifulSoup(text, "html.parser")
        for s in soup(['style', 'script', '[document]', 'head', 'title']):
            s.extract()
        visible_text = soup.getText()
    ok = True
    return ok, [visible_text.encode('ascii', 'replace').decode('ascii').replace('\n', '.')]
def extract(self, request: ExtractorRequest = None) -> ExtractorResponse:
    try:
        # request options for the document url and the tika server where we send the bytes to
        # see: https://requests.kennethreitz.org/en/master/api/#requests.request
        tika_req_options = {"timeout": 15}
        document_req_options = {"timeout": 15, "allow_redirects": True}

        if request.url:
            resp = requests.get(request.url, **document_req_options)
            buffer = BytesIO(resp.content)
            parsed = parser.from_buffer(buffer, requestOptions=tika_req_options)
        else:
            parsed = parser.from_file(request.filename, requestOptions=tika_req_options)

        text = parsed["content"] or ""
        # merge parsed metadata with source info
        meta = {**parsed["metadata"], **{"source": "document"}}
        # construct response
        response = ExtractorResponse(meta=meta, text=text)
    except Exception as e:
        msg = f"Error extracting text via Tika extractor: '{str(e)}'"
        log.error(msg)
        response = ExtractorResponse(error=msg)
    return response
def craft(self, uri: str, buffer: bytes, *args, **kwargs):
    from tika import parser

    headers = {
        'X-Tika-PDFOcrStrategy': self.tika_ocr_strategy,
        'X-Tika-PDFextractInlineImages': self.tika_extract_inline_images,
        'X-Tika-OCRLanguage': self.tika_ocr_language
    }
    request_options = {'timeout': self.tika_request_timeout}
    text = ""

    if buffer:
        result = parser.from_buffer(string=buffer,
                                    serverEndpoint=TIKA_URL,
                                    xmlContent=False,
                                    headers=headers,
                                    requestOptions=request_options)
    elif uri:
        result = parser.from_file(filename=uri,
                                  serverEndpoint=TIKA_URL,
                                  service='all',
                                  xmlContent=False,
                                  headers=headers,
                                  requestOptions=request_options)
    else:
        raise ValueError('No value found in "buffer" and "uri"')

    if 'status' in result:
        if result['status'] == 200:
            text = result['content']

    return dict(text=text)
def parallel_requests_test():
    alec_bills = [json.loads(x) for x in
                  open("{0}/model_legislation/alec_bills.json".format(DATA_PATH))]
    test_queries = [base64.b64decode(s['source']) for s in alec_bills]
    pattern = re.compile(r"[0-9]\.\s.*")
    for i, t in enumerate(test_queries):
        test_queries[i] = tp.from_buffer(t)['content']
        test_queries[i] = " ".join(re.findall(pattern, test_queries[i]))
        # test_queries[i] = test_queries[i].split()
        # test_queries[i] = " ".join(test_queries[i][0:200])
    test_queries = test_queries[0:100]

    ec = ElasticConnection()
    serial_time = time.time()
    for test_query in test_queries:
        ec.similar_doc_query(test_query)
    print("serial time: ", time.time() - serial_time)

    pool = Pool(processes=7)
    parallel_time = time.time()
    pool.map(parallel_query, test_queries)
    print("parallel time: ", time.time() - parallel_time)
    exit()
def extract_model_legislation(json_file, encoded):
    '''
    Keyword Args:
        json_file: corresponds to json file with model legislation
        encoded: True/False if json file is b64 encoded

    Returns:
        dictionary with url, date, and text of model legislation

    Description:
        extract text from model legislation
    '''
    data = []
    with open(json_file) as f:
        for line in f:
            data.append(json.loads(line))

    model_legislation = {}
    for i in range(len(data)):
        model_legislation[i] = data[i]

    if encoded:
        for i in range(len(model_legislation)):
            try:
                ml = model_legislation[i]['source']
                ml = base64.b64decode(ml)
                ml = tp.from_buffer(ml)
                model_legislation[i]['source'] = ml['content']
            except AttributeError:
                model_legislation[i]['source'] = None
        return model_legislation
    else:
        return model_legislation
def process():
    uploaded_doc = request.files.get('document')
    if uploaded_doc is None:
        return 'Kolom dokumen harus diisi.'
    name, ext = os.path.splitext(uploaded_doc.filename)
    if ext not in ('.pdf',):
        return 'Berkas dokumen wajib bertipe PDF.'

    input_text = parser.from_buffer(uploaded_doc.file)["content"]
    algorithm_identifiers = request.forms.get("algorithm").split("-")
    algorithm_id = algorithm_identifiers[0]
    algorithm_fold = algorithm_identifiers[1]
    model = joblib.load(get_model_file_name(algorithm_id, algorithm_fold))
    vectorizer = joblib.load(get_vectorizer_file_name(algorithm_fold))
    answer = model.predict(vectorizer.transform([input_text]).toarray())
    response.set_cookie(
        "message",
        "Dokumen {} termasuk klasifikasi {}".format(uploaded_doc.filename, answer[0]))
    return redirect("/")
def query_time_speed_test():
    from tika import parser as tp
    import re
    import numpy as np

    alec_bills = [json.loads(x) for x in
                  open("{0}/model_legislation/alec_bills.json".format(DATA_PATH))]
    test_queries = [base64.b64decode(s['source']) for s in alec_bills]
    pattern = re.compile(r"[0-9]\.\s.*")
    for i, t in enumerate(test_queries):
        test_queries[i] = tp.from_buffer(t)['content']
        test_queries[i] = " ".join(re.findall(pattern, test_queries[i]))
        test_queries[i] = test_queries[i].split()
    test_queries = [x for x in test_queries if len(x) >= 1500]

    query_sizes = np.arange(50, 1050, 50)
    ec = ElasticConnection()
    avg_times = []
    for query_size in query_sizes:
        temp_times = []
        for query in test_queries:
            query = " ".join(query[0:query_size])
            t1 = time.time()
            ec.similar_doc_query(query, num_results=1000)
            temp_times.append(time.time() - t1)
        avg_times.append(np.mean(temp_times))
        print("query size {0} , avg time (s) {1}".format(query_size, np.mean(temp_times)))
    for i in avg_times:
        print(i)
def extract_text_from_buffer(buffer, exclude_commons=False):
    buffer.seek(0)
    raw = parser.from_buffer(buffer)
    content = raw['content'].split('\n')
    content = [row for row in content if row != ""]
    return content
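# Hedged usage sketch (not from the original module): shows one way to feed
# extract_text_from_buffer() above from a local file. The file name is a
# placeholder and a reachable Tika server is assumed.
if __name__ == "__main__":
    from io import BytesIO

    with open("example.pdf", "rb") as fh:
        lines = extract_text_from_buffer(BytesIO(fh.read()))
    print(len(lines), "non-empty lines extracted")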
def handle_dirty_docs(self):
    dirty_docs = self.docstore.select(
        butter_user_id=self.butter_user_id,
        datasource_user_id=self.datasource_user_id,
        dirty=True)
    for doc in dirty_docs:
        result = self.driver.retrieve_data(doc)
        if result.data or result.unicode_data:
            doc['content'] = parser.from_buffer(result.data)
            doc['dirty'] = False
            self.docstore.update_raw([doc])
            update_docs = [doc] + result.docs
            for d in [d for d in result.docs if 'butter_user_id' not in d]:
                d['butter_user_id'] = doc['butter_user_id']
                d['datasource_user_id'] = doc['datasource_user_id']
            self.docstore.update_raw(update_docs)
        if result.should_remove_doc:
            self.docstore.delete(doc)
        if result.should_remove_children:
            # handle any docs to be deleted
            self.remove_child_docs(doc, self.butter_user_id,
                                   self.datasource_user_id,
                                   [d['id'] for d in result.docs])
def remove_header_footer(xml_data):
    # Function to remove header and footer from the text data
    xhtml_data = BeautifulSoup(xml_data['content'])
    page0 = None
    all_pages = ""
    total_pages = 0
    all_pages_enum = xhtml_data.find_all('div', attrs={'class': 'page'})
    if len(all_pages_enum) <= 0:
        return xhtml_data.getText()
    total_pages = len(all_pages_enum)
    page_to_select = int(round(total_pages / 2))

    for page, content in enumerate(all_pages_enum):
        _buffer = StringIO()
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        all_pages = all_pages + parsed_content['content'].strip()
        if page == page_to_select:
            page0 = parsed_content['content']

    first_page_split = page0.split("\n")
    for page_str in first_page_split:
        if len(page_str.rstrip()) > 0:
            page_str = re.escape(page_str)
            page_str = re.sub(r'\d+', '\\\\d+', page_str)
            list_of_instances = [m.start() for m in
                                 re.finditer(page_str + "[\r\n]", all_pages)]
            if len(list_of_instances) > round(total_pages / 2):
                all_pages = re.sub(page_str + "[\r\n]", '', all_pages)
    return all_pages
def get_task():
    if 'file' not in request.files:
        abort(400)
    file = request.files['file']
    if file.filename == '':
        abort(400)
    if file and allowed_file(file.filename):
        try:
            text = parser.from_buffer(file.read())
        except Exception as e:
            return (str(e))
        result = {
            "lang": detect(text['content']),
            "text": text['content'].replace('\n', '').replace('\t', ''),
            "size": True
        }
        if (text['content'].split(" ").__len__() > 50000):
            logging.error("Document length exceeded limit. No. of chars: ")
            logging.error(text['content'].split(" ").__len__())
            result["size"] = False
        return Response(json.dumps(result), mimetype='application/json')
    abort(400)
    return 'Something went wrong, try again!'
def esIndex(ccaDir, team, crawler, url, index, docType):
    ccaJsonList = list_files(ccaDir)
    print "Processing ["+str(len(ccaJsonList))+"] files."

    procList = []
    failedList = []
    failedReasons = []
    for f in ccaJsonList:
        ccaDoc = None
        newDoc = {}
        with open(f, 'r') as fd:
            try:
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                ccaDoc = json.loads(cbor.loads(c), encoding='utf8')
                newDoc["url"] = ccaDoc["url"]
                newDoc["timestamp"] = ccaDoc["imported"]
                newDoc["team"] = team
                newDoc["crawler"] = crawler
                newDoc["raw_content"] = ccaDoc["response"]["body"]
                newDoc["content_type"] = getContentType(ccaDoc)
                parsed = parser.from_buffer(newDoc["raw_content"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                newDoc["crawl_data"]["content"] = parsed["content"]
                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                indexDoc(url, newDoc, index, docType)
                procList.append(f)
            except ValueError, err:
                failedList.append(f)
                failedReasons.append(str(err))
def parse_and_enqueue(buffer: bytes, aptnote: dict, parser, queue: Queue,
                      condition: Condition) -> None:
    parsed_buffer: dict = parser.from_buffer(buffer)
    augmented_aptnote: dict = augment_aptnote(aptnote, parsed_buffer)
    with condition:
        queue.put(augmented_aptnote)
        condition.notify()
def getImageFeatures(self, params):
    try:
        import cv2
        import numpy as np
        cv2_available = True
    except ImportError:
        cv2_available = False

    if 'url' in params:
        data = requests.get(params['url'], verify=False).content
    else:
        data = str(cherrypy.request.body.read())

    # Run Tika once
    parsed = parser.from_buffer(data)
    tika = {}
    for (k, v) in parsed["metadata"].iteritems():
        k = k.lower().replace(':', '_').replace(' ', '_').replace('-', '_')
        tika[k] = v
    tika['content'] = parsed["content"]

    if cv2_available:
        file_bytes = np.asarray(bytearray(data), dtype=np.uint8)
        image = cv2.imdecode(file_bytes, flags=cv2.CV_LOAD_IMAGE_UNCHANGED)
        if image is not None:
            if len(image.shape) < 3 or image.shape[2] == 1:
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
            v = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8],
                             [0, 256, 0, 256, 0, 256])
            v = v.flatten()
            hist = v / sum(v)
            tika['histogram'] = hist.tolist()

    return tika
def review_pdf(urls):
    for url in urls:
        pdf_obj = requests.get(url, stream=True)
        raw_xml = parser.from_buffer(pdf_obj.content, xmlContent=True)
        # raw_xml = parser.from_file(file, xmlContent=True)
        body = raw_xml['content'].split('<body>')[1].split('</body>')[0]
        # print(body)
        body_without_tag = (body.replace("<p>", "").replace("</p>", "")
                            .replace("<div>", "").replace("</div>", "")
                            .replace("<p />", ""))
        text_pages = body_without_tag.split("""<div class="page">""")[1:]
        num_pages = len(text_pages)
        if num_pages == int(raw_xml['metadata']['xmpTPg:NPages']):  # check if it worked correctly
            print(num_pages)

        pdf_lines = []
        for page_num in range(num_pages):
            if re.search("Review of Performance", text_pages[page_num], re.IGNORECASE):
                skipHeadings = True
                lines = text_pages[page_num].split("\n")
                for line in lines:
                    if skipHeadings:
                        if re.search("Review of Performance", line, re.IGNORECASE):
                            skipHeadings = False
                    if skipHeadings:
                        continue
                    print(line)
                    pdf_lines.append(line)
        return pdf_lines
def fetch(self, no):
    logger.debug(f'Fetching Actualización nº {no}')
    r = requests.get(
        f'https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/'
        f'alertasActual/nCov-China/documentos/Actualizacion_{no}_COVID-19.pdf'
    )
    return parser.from_buffer(r.content) if r.status_code == 200 else None
def pdf_to_txt(full_path):
    """Turn all text from a pdf into a raw string.

    full_path: string, full path to the pdf file to convert
    """
    with open(full_path, 'rb') as file:
        extracted_text = parser.from_buffer(file.read())
    return extracted_text['content']
def pdf2text(raw):
    text = None
    try:
        content = parser.from_buffer(raw)
        if content['status'] == 200:
            text = content['content']
    except Exception:
        text = None
    return text
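# Hedged usage sketch (not in the original source): pdf2text() above takes the
# raw bytes of a document. The path below is a placeholder and a running Tika
# server is assumed.
if __name__ == "__main__":
    with open("example.pdf", "rb") as fh:
        print(pdf2text(fh.read()) or "extraction failed")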
def _is_whitepaper(self, pdf, company):
    raw_pdf = parser.from_buffer(pdf)
    raw_text = raw_pdf['content']
    if raw_text and company.lower() in raw_text.lower() and (
            'whitepaper' in raw_text.lower() or 'white paper' in raw_text.lower()):
        return True
    else:
        return False
def findFrequencyDistribution(self, request, context):
    idDocument = request.document.idDocument
    file = request.document.file
    totalPalavras = request.total

    # Reading document
    parsed = parser.from_buffer(file)
    content = parsed['content']

    return extractor.generate_frequency_distribution(id=idDocument,
                                                     content=content,
                                                     total=totalPalavras)
def grab_more_eval_bills():
    with open('../../data/evaluation_set/bills_for_evaluation_set.csv') as f:
        bills_list = [row for row in csv.reader(f.read().splitlines())]

    bill_ids_list = []
    url_lists = []
    topic_list = []
    for i in range(len(bills_list)):
        state = bills_list[i][1]
        if state == 'ct':
            continue
        topic = bills_list[i][0]
        bill_number = bills_list[i][2]
        bill_number = re.sub(' ', '', bill_number)
        year = bills_list[i][3]
        url = bills_list[i][6]
        unique_id = str(state + '_' + year + '_' + bill_number)
        topic_list.append(topic)
        bill_ids_list.append(unique_id)
        url_lists.append(url)

    bills_ids = zip(bill_ids_list, url_lists)

    bad_count = 0
    bills_text = []
    state_list = []
    for i in range(len(bills_ids)):
        try:
            bill_text = get_bill_by_id(bills_ids[i][0])
        except IndexError:
            try:
                url = bills_ids[i][1]
                doc = urllib.urlopen(url).read()
                bill_text = parser.from_buffer(doc)['content']
                print url
            except IOError:
                bad_count += 1
                print 'bad_count: ', bad_count
                # skip this case
                continue
        bills_text.append(bill_text)
        state = bills_ids[i][0][0:2]
        state_list.append(state)

    bills_state = zip(bills_text, state_list, topic_list)
    bill_type_1 = []
    bill_type_2 = []
    for bill in bills_state:
        if bill[-1] == 'Adult Guardianship and Protective Proceedings Jurisdiction Act':
            bill_type_1.append((bill[0], bill[1]))
        else:
            bill_type_2.append((bill[0], bill[1]))

    return [bill_type_2, bill_type_1]
def create_bills(ls):
    '''
    args:
        ls: list of lists of urls that correspond to matches

    returns:
        dictionary grouped by matches
    '''
    k = 0
    bill_id = 0
    bills = {}
    bad_count = 0
    for urls in ls:
        for url, state in urls:
            try:
                print "bill_id: " + str(bill_id)
                bills[bill_id] = {}
                doc = urllib2.urlopen(url).read()
                text = parser.from_buffer(doc)['content']
                bills[bill_id]['url'] = url
                bills[bill_id]['text'] = text
                bills[bill_id]['match'] = k
                bills[bill_id]['state'] = state
            except:
                bad_count += 1
                print 'bad_count: ', bad_count
            bill_id += 1
        k += 1

    # get more evaluation bills
    eval_bills = grab_more_eval_bills()
    for more_bills in eval_bills:
        print 'bill_group: ', k
        k += 1
        for text, state in more_bills:
            bill_id += 1
            print 'bill_id: ', bill_id
            bills[bill_id] = {}
            bills[bill_id]['text'] = text
            bills[bill_id]['state'] = state
            bills[bill_id]['match'] = k

    try:
        for bill in bills.keys():
            if bills[bill] == {} or bills[bill]['text'] == '' \
                    or bills[bill]['text'] == None:
                del bills[bill]
    except:
        pass

    return bills
def tika_read(pdf_file_path):
    pages, _buffer = [], StringIO()
    data = parser.from_file(pdf_file_path, xmlContent=True)
    xhtml_data = BeautifulSoup(data['content'], features="lxml")
    for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        # reset the buffer so each page is parsed on its own
        _buffer.seek(0)
        _buffer.truncate()
        pages.append(parsed_content['content'])
    return pages
def esIndexDoc(f, team, crawler, index, docType, failedList, failedReasons, procCount,
               url=None, outPath=None, storeprefix=None):
    CDRVersion = 2.0
    outFile = codecs.open(outPath + "/" + str(os.path.basename(f)), 'w', 'utf-8') if outPath else None
    with open(f, 'r') as fd:
        try:
            newDoc = {}
            c = fd.read()
            # fix for no request body out of Nutch CCA
            c = c.replace("\"body\" : null", "\"body\" : \"null\"")
            ccaDoc = json.loads(cbor.loads(c), encoding='utf8')
            newDoc["url"] = ccaDoc["url"]
            newDoc["timestamp"] = datetime.datetime.fromtimestamp(ccaDoc["imported"])
            newDoc["team"] = team
            newDoc["crawler"] = crawler
            contentType = getContentType(ccaDoc)
            newDoc["content_type"] = contentType
            parsed = parser.from_buffer(ccaDoc["response"]["body"].encode("utf-8"))
            newDoc["crawl_data"] = {}
            if "content" in parsed:
                newDoc["extracted_text"] = parsed["content"]
            if 'inlinks' in ccaDoc and ccaDoc['inlinks']:
                newDoc["crawl_data"]["obj_parents"] = ccaDoc['inlinks']
                newDoc["obj_parent"] = ccaDoc['inlinks'][0]
            # CDR version 2.0 additions
            newDoc["id"] = ccaDoc["key"]
            newDoc["obj_original_url"] = ccaDoc["url"]
            if 'text' in contentType or 'ml' in contentType:
                # web page
                newDoc["raw_content"] = ccaDoc["response"]["body"]
            else:
                # binary content, we link to store
                # ideally we should be storing it in both cases, but the CDR schema decided this way
                newDoc["obj_stored_url"] = url_to_nutch_dump_path(ccaDoc["url"], prefix=storeprefix)
            newDoc["extracted_metadata"] = parsed["metadata"] if 'metadata' in parsed else {}
            newDoc["version"] = CDRVersion
            verboseLog("Indexing ["+f+"] to Elasticsearch.")
            if url:
                indexDoc(url, newDoc, index, docType)
            if outFile:
                outFile.write(json.dumps(newDoc))
                outFile.write("\n")
            print "Processed " + f + " successfully"
            procCount += 1
        except Exception as err:
            failedList.append(f)
            failedReasons.append(str(err))
            traceback.print_exc()
def digitize_pdf(file_path):
    file_data = []
    _buffer = StringIO()
    data = parser.from_file(file_path, xmlContent=True)
    xhtml_data = BeautifulSoup(data['content'], features="lxml")
    for page, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})):
        # print('Parsing page {} of pdf file...'.format(page + 1))
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        # reset the buffer so each page is parsed on its own
        _buffer.seek(0)
        _buffer.truncate()
        file_data.append({'id': str(page + 1), 'content': parsed_content['content']})
    return file_data
async def worker(self, worker_id: int) -> None:
    """Main worker code. Listens to the beanstalk client for ready work,
    sends it to the tika service and then posts the extracted document to
    the queue for consumption by the analysis module."""
    while True:
        logging.info("Worker [%s] waiting for work.", worker_id)
        try:
            job = await self.reserve()
        except greenstalk.TimedOutError:
            continue

        logging.info("Worker [%s] got job.", worker_id)
        try:
            meta_data = json.loads(job.body)
        except json.decoder.JSONDecodeError as err:
            logging.warning("Unable to decode body %s [%s ...]", err, job.body[:25])
            self.client.delete(job)  # type: ignore
            continue

        if "filename" not in meta_data:
            logging.warning("No 'filename' field in meta data : %s", meta_data)
            self.client.delete(job)  # type: ignore
            continue

        if not os.path.isfile(meta_data["filename"]):
            logging.warning("Could not find file '%s'", meta_data["filename"])
            self.client.delete(job)  # type: ignore
            continue

        with open(meta_data["filename"], "rb") as fh:
            content = fh.read()

        if meta_data["filename"].endswith(".html"):
            content = html.unescape(content.decode("utf8")).encode("utf8")

        data = parser.from_buffer(content)
        data.update(meta_data)

        self.client.delete(job)  # type: ignore

        logging.info("Worker [%s] waiting to post result.", worker_id)
        try:
            self.client.put(  # type: ignore
                gzip.compress(json.dumps({**data, **meta_data}).encode("utf8")))
            logging.info("Worker [%s] job done.", worker_id)
        except greenstalk.JobTooBigError:
            logging.error("Job too big: %s.", meta_data["filename"])
def vaiNaFePDF(nome):
    raw = parser.from_buffer(nome)
    s = str(raw['content'])
    # strip leading and trailing newlines from the extracted content
    top = 0
    while s[top] == '\n':
        top += 1
    bottom = len(s) - 1
    while s[bottom] == '\n':
        bottom -= 1
    a = s[top:bottom + 1]
    print(a)
def get_page(link):
    page = http.request("GET", link)
    headers = page.headers
    if "Content-Type" in headers and (
            "application/pdf" in headers["Content-Type"] or link[-4:] == ".pdf"):
        page_content = parser.from_buffer(page.data, TIKA_API)["content"]
        page_type = "pdf"
    else:
        # treat as an html page by default (also when no Content-Type header is present)
        page_content = bs(page.data.decode('utf-8'), 'lxml')
        page_type = "html"
    root_url = get_root_url(link)
    return page_content, root_url, page_type
def process(self, ftl_doc, force):
    parsed_txt = None

    if force or not ftl_doc.count_pages:
        with ftl_doc.binary.open("rb") as ff:
            parsed_txt = parser.from_buffer(ff.read())

        if "metadata" in parsed_txt and "xmpTPg:NPages" in parsed_txt["metadata"]:
            atomic_ftl_doc_update(
                ftl_doc.pid,
                {"count_pages": int(parsed_txt["metadata"]["xmpTPg:NPages"])},
            )
        else:
            logger.warning(
                f"{self.log_prefix} Pages number can't be retrieved for document {ftl_doc.pid}"
            )
    else:
        logger.debug(
            f"{self.log_prefix} Skipping Tika extract (page count) for document {ftl_doc.pid}"
        )

    if force or not ftl_doc.content_text:
        if not parsed_txt:
            with ftl_doc.binary.open("rb") as ff:
                parsed_txt = parser.from_buffer(ff.read())

        if "content" in parsed_txt and parsed_txt["content"]:
            atomic_ftl_doc_update(
                ftl_doc.pid, {"content_text": parsed_txt["content"].strip()})
    else:
        logger.debug(
            f"{self.log_prefix} Skipping Tika extract (text) for document {ftl_doc.pid}"
        )
def parse_file(self, binary):
    try:
        parsed = parser.from_buffer(binary, "http://" + os.environ["pfe_tika_host"] + ":9998/tika")
        content_type = parsed["metadata"]["Content-Type"]
        content_type = content_type[1] if isinstance(content_type, list) else content_type
        # TODO: what to do in elastic search & in the ui when the index is empty?
        # Answer: it disappears forever... what to do then?
        content = parsed["content"] if parsed["content"] is not None else ""
    except Exception as e:
        content = ""
        content_type = "UNKNOWN"
        print("exception in parser.py" + str(e))
    return content, content_type
def get_plain_text(link):
    """
    :param link: link to a document (including an HTML doc)
    :return: plain text of that document
    """
    print('Current link: {}'.format(link), end='\r')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36'
    }
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    if 'drive.google.com' in link:
        try:
            link = convert_google_url(link)
        except Exception as e:
            print("Google Error:", e)

    try:
        with timeout(15, exception=RuntimeError):
            req = request.Request(link, headers=headers)
            buff = request.urlopen(req, context=ctx)
            parsed = parser.from_buffer(buff)
            return parsed['content']
    except Exception as e:
        try:
            resp = requests.get(link)
            parsed = parser.from_buffer(resp.content)
            return parsed['content']
        except:
            print('error:', e, 'link:', link)
            return 'UNAVAILABLE'
def __init__(self, path=None, content=None, tikaServer=None):
    if path is not None:
        parsed = parser.from_file(path, tikaServer)
    elif content is not None:
        parsed = parser.from_buffer(content, tikaServer)
    else:
        raise Exception("A path to the file or the content of the file must be provided.")

    self.data = ""
    if parsed["content"]:
        self.data = parsed["content"].strip()

    self.metadata = []
    if "metadata" in parsed:
        self.metadata = parsed["metadata"]
def tika_parse(html, show_content=False):
    parsed = None
    try:
        ServerEndpoint = u"http://localhost:9998"
        parsed = parser.from_buffer(html, serverEndpoint=ServerEndpoint)
        n = 0
        if show_content is True:
            for k, v in parsed[u"metadata"].items():
                logger.debug(u"  {} {} = {}".format(n, k, v))
                n += 1
            logger.debug(u"  {} Content = {} ...".format(n, parsed[u"content"].strip()))
    except Exception, msg:
        logger.error(u"{}".format(msg))
def cleanseBody(theDoc):
    if not tika_support:
        print (
            "cleanseBody requires Tika to be installed. "
            "Please check the documentation on how to install "
            "ETLlib with Tika support."
        )
        raise RuntimeError("Tika support not installed.")

    if "body" in theDoc:
        content = "<html>".encode('utf-8') + theDoc["body"].encode('utf-8') + "</html>".encode('utf-8')
        parsed = parser.from_buffer(content)
        for key, val in parsed["metadata"].iteritems():
            if key not in theDoc:
                theDoc[key] = val
            else:
                theDoc["tika_" + key] = val
        theDoc["body"] = parsed["content"]
def run_exist_tool(dir_list, output_name, subtype):
    file_list = get_file_list(dir_list)
    data = []
    for idx, val in enumerate(file_list):
        print(idx)
        with open(val) as input_file:
            if subtype is not None:
                mime_type = subtype
            else:
                subtype = ''
                mime_type = detector.from_buffer(input_file)
            if mime_type is not None and mime_type.endswith(subtype):
                parsed = parser.from_buffer(input_file)
                if 'metadata' in parsed and parsed['metadata'] is not None:
                    file_name = val.split('/')[-1]
                    data.append({file_name: parsed['metadata']})
    dump_to_json(output_name, data)
    return
rawText = pytesseract.image_to_string(Image.open(filename2), lang="rus")
print(rawText)
lines = rawText.split('\n')

import os
# os.putenv('TIKA_VERSION', 'default')  # set to the version string, e.g., 1.12, or 'default' for the current Tika version.
# os.putenv('TIKA_SERVER_JAR', '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar')  # set to the full URL to the remote Tika server jar to download and cache.
os.putenv('TIKA_SERVER_ENDPOINT', 'http://localhost:9998')  # set to the host (local or remote) for the running Tika server jar.
# os.putenv('TIKA_SERVER_ENDPOINT', 'http://localhost:9998/language/string')  # set to the host (local or remote) for the running Tika server jar.
# os.putenv('TIKA_CLIENT_ONLY', 'True')  # if set to True, then TIKA_SERVER_JAR is ignored, and Tika relies on the value of TIKA_SERVER_ENDPOINT, acting as a REST client.
# os.putenv('TIKA_TRANSLATOR', 'org/apache/tika/language/translate/')  # set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
# os.putenv('TIKA_SERVER_CLASSPATH', '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar')  # set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.

tika.initVM()
from tika import parser

parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])

global Verbose
Verbose = True

result = translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)
result = detector.from_buffer("comme çi comme ça")
print(result)
result = translate.from_buffer("comme çi comme ça", 'fr', 'en')
print(result)
result = language.from_buffer("comme çi comme ça")
print(result)

for line in lines:
    if len(line) > 0:
data["responsestatus"]=responsestatus data["responseheader"]=responseheader #Get NER data docType = detector.from_file(path_to_file) if docType in tagRatioFileTypes: buffer = handleHtml(path_to_file,docType) else: try: buffer=subprocess.check_output(['java', '-jar', tikaSnapshotPath, '-t', path_to_file]) except: errorFile.write(path_to_file+"\n") continue if (buffer==None): errorFile.write(path_to_file+"\n") continue if (len(buffer)==0): errorFile.write(path_to_file+"\n") #continue parsedData=parser.from_buffer(buffer) metaData=formatMeta(parsedData,docType,buffer) data["NER"]=metaData measurementJson[path_to_file]=data json.dump(measurementJson,measurementFile,indent=4) measurementFile.close() errorFile.close()
def esIndex(ccaDir, team, crawler, index, docType, url=None, outPath=None, storeprefix=None):
    if not url and not outPath:
        raise Exception("Either Elastic Url or output path must be specified.")
    ccaJsonList = list_files(ccaDir)
    print "Processing ["+str(len(ccaJsonList))+"] files."

    procCount = 0
    failedList = []
    failedReasons = []
    CDRVersion = 2.0
    contentLengthThreshold = 150
    fail_keys = [
        ["warning", "return", "string"],
        ["error", "404"],
        ["domain", "expired"],
        ["annonse", "ble", "ikke", "funnet"],
        ["no", "se", "pudo", "encontrar", "el", "anuncio", "solicitado"],
    ]
    outFile = codecs.open(outPath, 'w', 'utf-8') if outPath else None

    for f in ccaJsonList:
        with open(f, 'r') as fd:
            try:
                newDoc = {}
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                verboseLog("Parsing [" + f + "]")
                ccaDoc = simplejson.loads(cbor.loads(c).value, encoding='utf8')

                if "Content-Type" in ccaDoc["response"]["headers"] and \
                        not ccaDoc["response"]["headers"]["Content-Type"].split(";")[0]:
                    failedList.append(f)
                    failedReasons.append("No Content Type Found")
                    continue

                newDoc["url"] = ccaDoc["url"]
                newDoc["timestamp"] = ccaDoc["imported"]
                newDoc["team"] = team
                newDoc["crawler"] = crawler
                contentType = getContentType(ccaDoc)
                newDoc["content_type"] = contentType
                parsed = parser.from_buffer(ccaDoc["response"]["body"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                if "content" in parsed:
                    newDoc["crawl_data"]["content"] = parsed["content"]
                    newDoc["extracted_text"] = parsed["content"]
                if 'inlinks' in ccaDoc and ccaDoc['inlinks']:
                    newDoc["crawl_data"]["obj_parents"] = ccaDoc['inlinks']
                    newDoc["obj_parent"] = ccaDoc['inlinks'][0]

                # CDR version 2.0 additions
                newDoc["_id"] = ccaDoc["key"]
                newDoc["obj_original_url"] = ccaDoc["url"]
                if 'text' in contentType or 'ml' in contentType:
                    # web page
                    newDoc["raw_content"] = ccaDoc["response"]["body"]
                    if "obj_parent" in newDoc:
                        del newDoc["obj_parent"]
                else:
                    # binary content, we link to store
                    # ideally we should be storing it in both cases, but the CDR schema decided this way
                    newDoc["obj_stored_url"] = url_to_nutch_dump_path(ccaDoc["url"], prefix=storeprefix)
                newDoc["extracted_metadata"] = parsed["metadata"] if 'metadata' in parsed else {}
                newDoc["version"] = CDRVersion

                # Validation Checks
                if "obj_parent" not in newDoc and ("raw_content" not in newDoc or not newDoc["raw_content"]):
                    failedList.append(f)
                    failedReasons.append("No Raw Content Found")
                    continue
                if "obj_parent" not in newDoc and len(newDoc["raw_content"]) < contentLengthThreshold:
                    failedList.append(f)
                    failedReasons.append("Raw Content is less than " + str(contentLengthThreshold) + " characters")
                    continue
                # Check for fail_keys
                if "obj_parent" not in newDoc and len(newDoc["raw_content"]) >= contentLengthThreshold:
                    is_fail = False
                    content_nopunct = remove_punctuation(newDoc["raw_content"])
                    content_list = content_nopunct.split(" ")
                    for key_list in fail_keys:
                        counter = 0
                        listlen = len(key_list)
                        for item in key_list:
                            if item in content_list:
                                counter += 1
                        if counter >= listlen:
                            is_fail = True
                    if "request ad could not be" in content_nopunct:
                        is_fail = True
                    if is_fail:
                        failedList.append(f)
                        failedReasons.append("Raw Content indicates failed crawl")
                        continue
                if "obj_parent" not in newDoc and ("extracted_text" not in newDoc or not newDoc["extracted_text"]):
                    failedList.append(f)
                    failedReasons.append("No Extracted Text Found")
                    continue

                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                if url:
                    indexDoc(url, newDoc, index, docType)
                if outFile:
                    outFile.write(simplejson.dumps(newDoc))
                    outFile.write("\n")
                procCount += 1
            except Exception as err:
                failedList.append(f)
                failedReasons.append(str(err))
                traceback.print_exc()

    if outFile:
        print("Output Stored at %s" % outPath)
        outFile.close()
    print "Processed " + str(procCount) + " CBOR files successfully."
    print "Failed files: " + str(len(failedList))
    if _verbose:
        for i in range(len(failedList)):
            verboseLog("File: "+failedList[i]+" failed because "+failedReasons[i])
mergeJsonFile = open("/Users/charanshampur/PycharmProjects/assign2/Merged.json", "r")
sweetJsonFile = open("Sweet.json", "w")
jsonLoad = json.load(mergeJsonFile)


def getSweetNer(metaData):
    metaDataFormatted = {}
    for key, value in metaData.items():
        if re.match("NER", key):
            metaDataFormatted[key] = value
    return metaDataFormatted


sweetJson = []
for doc in jsonLoad:
    print doc["id"]
    sweetDict = {}
    sweetDict.update(doc)
    if "content" in doc:
        parsedData = parser.from_buffer(doc["content"])
        if "metadata" in parsedData:
            meta = getSweetNer(parsedData["metadata"])
            if len(meta) > 0:
                sweetDict.update(meta)
    sweetJson.append(sweetDict)

json.dump(sweetJson, sweetJsonFile, indent=4)