Example #1
def computeScores(inputDir, outCSV, acceptTypes):

    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        files_tuple = itertools.combinations(
            filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]

                # read each file's raw bytes and hand them to Tika as a buffer
                with open(file1, 'rb') as f1:
                    file1_parsedData = parser.from_buffer(f1.read())
                with open(file2, 'rb') as f2:
                    file2_parsedData = parser.from_buffer(f2.read())

                v1 = Vector(file1, file1_parsedData["content"])
                v2 = Vector(file2, file2_parsedData["content"])

                row_cosine_distance.append(v1.cosTheta(v2))

                a.writerow(row_cosine_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
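The snippet above depends on helpers from its own project, filterFiles and a Vector class with a cosTheta method, which are not shown here. A minimal sketch of what such a Vector might look like, assuming a simple term-frequency representation (hypothetical, not the original project's code):

import math
from collections import Counter

class Vector:
    """Hypothetical term-frequency vector with cosine similarity."""

    def __init__(self, filename, content):
        self.filename = filename
        self.features = Counter((content or "").split())

    def cosTheta(self, other):
        # cosine of the angle between the two term-frequency vectors
        dot = sum(count * other.features[term] for term, count in self.features.items())
        norm = math.sqrt(sum(c * c for c in self.features.values())) * \
               math.sqrt(sum(c * c for c in other.features.values()))
        return dot / norm if norm else 0.0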
Example #2
def get_content(r, headers, requestOptions=None):
    """Format downloaded pdf content."""
    f = BytesIO(r.content)
    if requestOptions is None:
        raw = parser.from_buffer(f, headers=headers)
    else:
        raw = parser.from_buffer(f,
                                 headers=headers,
                                 requestOptions=requestOptions)
    f.close()
    return raw
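A possible way to call get_content, assuming a reachable download URL and a running Tika server; the URL and header values below are placeholders, not part of the original snippet:

import requests

r = requests.get("https://example.com/sample.pdf", timeout=30)  # placeholder URL
headers = {"X-Tika-PDFOcrStrategy": "no_ocr"}                    # assumed Tika header
raw = get_content(r, headers, requestOptions={"timeout": 60})
print(raw.get("metadata", {}).get("Content-Type"))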
Example #3
def esIndex(ccaDir, team, crawler, index, docType, url=None, outPath=None, storeprefix=None):
    if not url and not outPath:
        raise Exception("Either Elastic Url or output path must be specified.")
    ccaJsonList = list_files(ccaDir)
    print "Processing ["+str(len(ccaJsonList))+"] files."

    procCount = 0
    failedList=[]
    failedReasons=[]
    CDRVersion = 2.0
    outFile = codecs.open(outPath, 'w', 'utf-8') if outPath else None

    for f in ccaJsonList:
        with open(f, 'r') as fd:
            try:
                newDoc = {}
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                ccaDoc = json.loads(cbor.loads(c).value, encoding='utf8')
                newDoc["url"] = ccaDoc["url"]

                newDoc["timestamp"] = ccaDoc["imported"]
                newDoc["team"] = team
                newDoc["crawler"] = crawler
                newDoc["raw_content"] = ccaDoc["response"]["body"]
                newDoc["content_type"] = getContentType(ccaDoc)
                parsed = parser.from_buffer(newDoc["raw_content"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                if "content" in parsed:
                    newDoc["crawl_data"]["content"] = parsed["content"]
                    newDoc["extracted_text"] = parsed["content"]

                # CDR version 2.0 additions
                newDoc["_id"] = ccaDoc["key"]
                newDoc["obj_original_url"] = ccaDoc["url"]
                # newDoc["obj_parent"] = ??? Missing # TODO: get this field some how!
                newDoc["obj_stored_url"] = url_to_nutch_dump_path(ccaDoc["url"], prefix=storeprefix)
                newDoc["extracted_metadata"] = parsed["metadata"] if 'metadata' in parsed else {}
                newDoc["version"] = CDRVersion
                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                if url:
                    indexDoc(url, newDoc, index, docType)
                if outFile:
                    outFile.write(json.dumps(newDoc))
                    outFile.write("\n")
                procCount += 1
            except Exception as err:
                failedList.append(f)
                failedReasons.append(str(err))
                traceback.print_exc()
    if outFile:
        print("Output Stored at %s" % outPath)
        outFile.close()
    print "Processed " + str(procCount) + " CBOR files successfully."
    print "Failed files: " + str(len(failedList))

    if _verbose:
        for i in range(len(failedList)):
            verboseLog("File: "+failedList[i]+" failed because "+failedReasons[i])
Example #4
def textExtractor(URL):
    # Default timeout 10 seconds
    ok = False
    text = ""
    try:
        text = ur.urlopen(URL).read()
    except Exception as e:
        print("URL open Failed:", e)
    
    if len(text) == 0:
        try:
            r = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'})
            if r.status_code == 200:
                text = r.text
            else:
                return ok, ""
        except Exception as e:
            print("requests open Failed:", e)

    visible_text = ""
    if USE_TIKA:
        from tika import parser
        visible_text = parser.from_buffer(text)['content']
    else:
        soup = BeautifulSoup(text, "html.parser")
        [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
        visible_text = soup.getText()
    ok = True

    return ok, [visible_text.encode('ascii', 'replace').decode('ascii').replace('\n','.')]
Example #5
    def extract(self, request: ExtractorRequest = None) -> ExtractorResponse:
        try:
            # request options to the document url and the tika server where we send the bytes to
            # see: https://requests.kennethreitz.org/en/master/api/#requests.request
            tika_req_options = {"timeout": 15}
            document_req_options = {"timeout": 15, "allow_redirects": True}

            if request.url:
                resp = requests.get(request.url, **document_req_options)

                buffer = BytesIO(resp.content)
                parsed = parser.from_buffer(buffer,
                                            requestOptions=tika_req_options)
            else:
                parsed = parser.from_file(request.filename,
                                          requestOptions=tika_req_options)

            text = parsed["content"] or ""
            # merge parsed metadata with source info
            meta = {**parsed["metadata"], **{"source": "document"}}

            # construct response
            response = ExtractorResponse(meta=meta, text=text)
        except Exception as e:
            msg = f"Error extracting text via Tika extractor: '{str(e)}'"
            log.error(msg)
            response = ExtractorResponse(error=msg)

        return response
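The ExtractorRequest and ExtractorResponse types are defined elsewhere in that project; a minimal sketch of containers that would satisfy this method, assuming plain dataclasses (hypothetical):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ExtractorRequest:
    url: Optional[str] = None       # fetch the document over HTTP when set
    filename: Optional[str] = None  # otherwise parse a local file

@dataclass
class ExtractorResponse:
    meta: dict = field(default_factory=dict)
    text: str = ""
    error: str = ""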
Example #6
    def craft(self, uri: str, buffer: bytes, *args, **kwargs):
        from tika import parser
        headers = {
            'X-Tika-PDFOcrStrategy': self.tika_ocr_strategy,
            'X-Tika-PDFextractInlineImages': self.tika_extract_inline_images,
            'X-Tika-OCRLanguage': self.tika_ocr_language
        }
        request_options = {'timeout': self.tika_request_timeout}

        text = ""
        if buffer:
            result = parser.from_buffer(string=buffer,
                                        serverEndpoint=TIKA_URL,
                                        xmlContent=False,
                                        headers=headers,
                                        requestOptions=request_options)
        elif uri:
            result = parser.from_file(filename=uri,
                                      serverEndpoint=TIKA_URL,
                                      service='all',
                                      xmlContent=False,
                                      headers=headers,
                                      requestOptions=request_options)
        else:
            raise ValueError('No value found in "buffer" and "uri"')

        if 'status' in result:
            if result['status'] == 200:
                text = result['content']

        return dict(text=text)
Example #7
def parallel_requests_test():
    alec_bills = [
        json.loads(x) for x in open(
            "{0}/model_legislation/alec_bills.json".format(DATA_PATH))
    ]
    test_queries = [base64.b64decode(s['source']) for s in alec_bills]
    pattern = re.compile(r"[0-9]\.\s.*")
    for i, t in enumerate(test_queries):
        test_queries[i] = tp.from_buffer(t)['content']
        test_queries[i] = " ".join(re.findall(pattern, test_queries[i]))
        #test_queries[i] = test_queries[i].split()
        #test_queries[i] = " ".join(test_queries[i][0:200])

    test_queries = test_queries[0:100]
    ec = ElasticConnection()
    serial_time = time.time()
    for test_query in test_queries:
        ec.similar_doc_query(test_query)

    print("serial time:  ", time.time() - serial_time)
    pool = Pool(processes=7)
    parallel_time = time.time()
    pool.map(parallel_query, test_queries)
    print("parallel time:  ", time.time() - parallel_time)
    exit()
Example #8
def extract_model_legislation(json_file, encoded):
    '''
    Keyword Args:
        json_file: path to the json file with model legislation
        encoded: True/False, whether the json file is b64 encoded

    returns:
        dictionary with url, date, and text of model legislation
    description:
        extract text from model legislation
    '''
    data = []
    with open(json_file) as f:
        for line in f:
            data.append(json.loads(line))

    model_legislation = {}
    for i in range(len(data)):
        model_legislation[i] = data[i]

    if encoded == True:
        for i in range(len(model_legislation)):
            try:
                ml = model_legislation[i]['source']
                ml = base64.b64decode(ml)
                ml = tp.from_buffer(ml)
                model_legislation[i]['source'] = ml['content']
            except AttributeError:
                model_legislation[i]['source'] = None
        return model_legislation

    else:
        return model_legislation
Example #9
def process():
    uploaded_doc = request.files.get('document')

    if uploaded_doc is None:
        return 'Kolom dokumen harus diisi.'

    name, ext = os.path.splitext(uploaded_doc.filename)
    if ext not in ('.pdf',):
        return 'Berkas dokumen wajib bertipe PDF.'

    input_text = parser.from_buffer(uploaded_doc.file)["content"]

    algorithm_identifiers = request.forms.get("algorithm").split("-")
    algorithm_id = algorithm_identifiers[0]
    algorithm_fold = algorithm_identifiers[1]

    model = joblib.load(get_model_file_name(algorithm_id, algorithm_fold))

    vectorizer = joblib.load(get_vectorizer_file_name(algorithm_fold))

    answer = model.predict(vectorizer.transform([input_text]).toarray())

    response.set_cookie(
        "message", "Dokumen {} termasuk klasifikasi {}".format(
            uploaded_doc.filename,
            answer[0],
        ))

    return redirect("/")
Example #10
def query_time_speed_test():
    from tika import parser as tp
    import re
    import numpy as np

    alec_bills = [
        json.loads(x) for x in open(
            "{0}/model_legislation/alec_bills.json".format(DATA_PATH))
    ]
    test_queries = [base64.b64decode(s['source']) for s in alec_bills]
    pattern = re.compile(r"[0-9]\.\s.*")
    for i, t in enumerate(test_queries):
        test_queries[i] = tp.from_buffer(t)['content']
        test_queries[i] = " ".join(re.findall(pattern, test_queries[i]))
        test_queries[i] = test_queries[i].split()

    test_queries = [x for x in test_queries if len(x) >= 1500]
    query_sizes = np.arange(50, 1050, 50)
    ec = ElasticConnection()
    avg_times = []
    for query_size in query_sizes:
        temp_times = []
        for query in test_queries:
            query = " ".join(query[0:query_size])
            t1 = time.time()
            ec.similar_doc_query(query, num_results=1000)
            temp_times.append(time.time() - t1)

        avg_times.append(np.mean(temp_times))
        print("query size {0} , avg time (s) {1}".format(
            query_size, np.mean(temp_times)))

    for i in avg_times:
        print(i)
Example #11
def extract_text_from_buffer(buffer, exclude_commons=False):
    buffer.seek(0)
    raw = parser.from_buffer(buffer)
    content = raw['content'].split('\n')
    content = [row for row in content if row != ""]

    return content
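A short usage sketch for extract_text_from_buffer, assuming a local PDF (the path below is a placeholder):

from io import BytesIO

with open("sample.pdf", "rb") as fh:              # placeholder path
    lines = extract_text_from_buffer(BytesIO(fh.read()))
print(lines[:5])                                  # first few non-empty lines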
Example #12
    def handle_dirty_docs(self):
        dirty_docs = self.docstore.select(
            butter_user_id=self.butter_user_id,
            datasource_user_id=self.datasource_user_id,
            dirty=True)

        for doc in dirty_docs:
            result = self.driver.retrieve_data(doc)

            if result.data or result.unicode_data:
                doc['content'] = parser.from_buffer(result.data)

            doc['dirty'] = False
            self.docstore.update_raw([doc])
            update_docs = [doc] + result.docs

            for d in [d for d in result.docs if 'butter_user_id' not in d]:
                d['butter_user_id'] = doc['butter_user_id']
                d['datasource_user_id'] = doc['datasource_user_id']
            self.docstore.update_raw(update_docs)

            if result.should_remove_doc:
                self.docstore.delete(doc)

            if result.should_remove_children:
                # handle any docs to be deleted
                self.remove_child_docs(doc, self.butter_user_id,
                                       self.datasource_user_id,
                                       [d['id'] for d in result.docs])
Example #13
def remove_header_footer(xml_data):
    # Function to remove header and footer from the text data
    xhtml_data = BeautifulSoup(xml_data['content'])
    page0 = None
    all_pages = ""
    total_pages = 0
    all_pages_enum = xhtml_data.find_all('div', attrs={'class': 'page'})
    if len(all_pages_enum) <= 0:
        return xhtml_data.getText()
    total_pages = len(all_pages_enum)
    page_to_select = int(round(total_pages / 2))

    for page, content in enumerate(all_pages_enum):
        _buffer = StringIO()
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        all_pages = all_pages + parsed_content['content'].strip()
        if page == page_to_select:
            page0 = parsed_content['content']

    first_page_split = page0.split("\n")

    for page_str in first_page_split:
        if len(page_str.rstrip()) > 0:
            page_str = re.escape(page_str)
            page_str = re.sub(r'\d+', '\\\\d+', page_str)
            list_of_instances = [
                m.start() for m in re.finditer(page_str + "[\r\n]", all_pages)
            ]
            if len(list_of_instances) > round(total_pages / 2):
                all_pages = re.sub(page_str + "[\r\n]", '', all_pages)
    return all_pages
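remove_header_footer expects the dictionary returned by a Tika XHTML parse; a possible call, with the file path as a placeholder:

from tika import parser

xml_data = parser.from_file("sample.pdf", xmlContent=True)  # placeholder path
clean_text = remove_header_footer(xml_data)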
Example #14
def get_task():
    if 'file' not in request.files:
        abort(400)
    file = request.files['file']
    if file.filename == '':
        abort(400)
    if file and allowed_file(file.filename):
        try:
            text = parser.from_buffer(file.read())
        except Exception as e:
            return (str(e))

        result = {
            "lang": detect(text['content']),
            "text": text['content'].replace('\n', '').replace('\t', ''),
            "size": True
        }
        if len(text['content'].split(" ")) > 50000:
            logging.error("Document length exceeded limit. Word count: ")
            logging.error(len(text['content'].split(" ")))
            result["size"] = False

        return Response(json.dumps(result), mimetype='application/json')

    abort(400)
    return 'Something went wrong, try again!'
Example #15
def esIndex(ccaDir, team, crawler, url, index, docType):
    ccaJsonList = list_files(ccaDir)
    print "Processing ["+str(len(ccaJsonList))+"] files."

    procList=[]
    failedList=[]
    failedReasons=[]

    for f in ccaJsonList:
        ccaDoc = None
        newDoc = {}
        with open(f, 'r') as fd:
            try:
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                ccaDoc = json.loads(cbor.loads(c), encoding='utf8')
                newDoc["url"] = ccaDoc["url"]
                newDoc["timestamp"] = ccaDoc["imported"]
                newDoc["team"] = team
                newDoc["crawler"] = crawler
                newDoc["raw_content"] = ccaDoc["response"]["body"]
                newDoc["content_type"] = getContentType(ccaDoc)
                parsed = parser.from_buffer(newDoc["raw_content"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                newDoc["crawl_data"]["content"] = parsed["content"]
                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                indexDoc(url, newDoc, index, docType)
                procList.append(f)
            except ValueError, err:
                failedList.append(f)
                failedReasons.append(str(err))
Example #16
def parse_and_enqueue(buffer: bytes, aptnote: dict, parser, queue: Queue,
                      condition: Condition) -> None:
    parsed_buffer: dict = parser.from_buffer(buffer)
    augmented_aptnote: dict = augment_aptnote(aptnote, parsed_buffer)
    with condition:
        queue.put(augmented_aptnote)
        condition.notify()
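A possible consumer for the queue/condition pair used above, assuming threading.Condition and queue.Queue (a sketch, not the original project's consumer):

from queue import Queue
from threading import Condition

queue: Queue = Queue()
condition = Condition()

def consume_one() -> dict:
    # wait until parse_and_enqueue signals that a parsed note is available
    with condition:
        while queue.empty():
            condition.wait()
        return queue.get()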
Example #17
    def getImageFeatures(self, params):
        try:
            import cv2
            import numpy as np
            cv2_available = True
        except ImportError:
            cv2_available = False

        if 'url' in params:
            data = requests.get(params['url'], verify=False).content
        else:
            data = str(cherrypy.request.body.read())

        # Run Tika once
        parsed = parser.from_buffer(data)
        tika = {}
        for (k, v) in parsed["metadata"].iteritems():
            k = k.lower().replace(':', '_').replace(' ', '_').replace('-', '_')
            tika[k] = v
        tika['content'] = parsed["content"]

        if cv2_available:
            file_bytes = np.asarray(bytearray(data), dtype=np.uint8)
            image = cv2.imdecode(file_bytes, flags=cv2.CV_LOAD_IMAGE_UNCHANGED);

            if image is not None:
                if len(image.shape) < 3 or image.shape[2] == 1:
                    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

                v = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
                v = v.flatten()
                hist = v / sum(v)
                tika['histogram'] = hist.tolist()

        return tika
Example #18
def review_pdf(urls):
    for url in urls:
        pdf_obj = requests.get(url, stream=True)
        raw_xml = parser.from_buffer(pdf_obj.content, xmlContent=True)
        # raw_xml = parser.from_file(file, xmlContent=True)
        body = raw_xml['content'].split('<body>')[1].split('</body>')[0]
        #print(body)
        body_without_tag = body.replace("<p>", "").replace("</p>", "").replace("<div>", "").replace("</div>","").replace("<p />","")
        text_pages = body_without_tag.split("""<div class="page">""")[1:]
        num_pages = len(text_pages)
        if num_pages==int(raw_xml['metadata']['xmpTPg:NPages']) : #check if it worked correctly
            print(num_pages)
        pdf_lines = []
        for page_num in range(num_pages):
            if re.search("Review of Performance", text_pages[page_num], re.IGNORECASE):
                skipHeadings = True
                lines = text_pages[page_num].split("\n")
                for line in lines:
                    if skipHeadings:
                        if re.search("Review of Performance", line, re.IGNORECASE):
                            skipHeadings = False
                        if skipHeadings:
                            continue
                    print(line)
                    pdf_lines.append(line)
    return pdf_lines
Example #19
 def fetch(self, no):
     logger.debug(f'Fetching Actualización nº {no}')
     r = requests.get(
         f'https://www.mscbs.gob.es/profesionales/saludPublica/ccayes/'
         f'alertasActual/nCov-China/documentos/Actualizacion_{no}_COVID-19.pdf'
     )
     return parser.from_buffer(r.content) if r.status_code == 200 else None
Example #20
def pdf_to_txt(full_path):
    """Turns all text from pdf to raw string

    full_path: string, full path to pdf file to convert
    """
    # use a context manager so the file handle is closed after parsing
    with open(full_path, 'rb') as file:
        extracted_text = parser.from_buffer(file)
    return extracted_text['content']
Example #21
def pdf2text(raw):
    text = None
    try:
        content = parser.from_buffer(raw)
        if content['status'] == 200:
            text = content['content']
    except Exception:
        text = None
    return text
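A short usage sketch, assuming a local PDF (placeholder path):

with open("report.pdf", "rb") as fh:   # placeholder path
    text = pdf2text(fh.read())
if text is None:
    print("Tika did not return a 200 status")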
Example #22
 def _is_whitepaper(self, pdf, company):
     raw_pdf = parser.from_buffer(pdf)
     raw_text = raw_pdf['content']
     if raw_text and company.lower() in raw_text.lower() and (
             'whitepaper' in raw_text.lower()
             or 'white paper' in raw_text.lower()):
         return True
     else:
         return False
Example #23
 def findFrequencyDistribution(self, request, context):
     idDocument = request.document.idDocument
     file = request.document.file
     totalPalavras = request.total
     # Reading document
     parsed = parser.from_buffer(file)
     content = parsed['content']
     return extractor.generate_frequency_distribution(id=idDocument,
                                                      content=content,
                                                      total=totalPalavras)
Example #24
def grab_more_eval_bills():
    with open('../../data/evaluation_set/bills_for_evaluation_set.csv') as f:
        bills_list = [row for row in csv.reader(f.read().splitlines())]
        
    bill_ids_list = []
    url_lists = []
    topic_list = []
    for i in range(len(bills_list)):
        state = bills_list[i][1]
        if state == 'ct':
            continue
        topic = bills_list[i][0]
        bill_number = bills_list[i][2]
        bill_number = re.sub(' ', '', bill_number)
        year = bills_list[i][3]
        url = bills_list[i][6]
        unique_id = str(state + '_' + year + '_' + bill_number)
        topic_list.append(topic)
        bill_ids_list.append(unique_id)
        url_lists.append(url)

    bills_ids = zip(bill_ids_list, url_lists)

    bad_count = 0
    bills_text = []
    state_list = []
    for i in range(len(bills_ids)):
        try:
            bill_text = get_bill_by_id(bills_ids[i][0])
        except IndexError:
            try:
                url = bills_ids[i][1]
                doc = urllib.urlopen(url).read()
                bill_text = parser.from_buffer(doc)['content']
                print url
            except IOError:
                # skip this case
                bad_count += 1
                print 'bad_count: ', bad_count
                continue
        bills_text.append(bill_text)
        state = bills_ids[i][0][0:2]
        state_list.append(state)

    bills_state = zip(bills_text, state_list, topic_list)

    bill_type_1 = []
    bill_type_2 = []
    for bill in bills_state:
        if bill[-1] == 'Adult Guardianship and Protective Proceedings Jurisdiction Act':
            bill_type_1.append((bill[0],bill[1]))
        else:
            bill_type_2.append((bill[0],bill[1]))

    return [bill_type_2, bill_type_1]
Example #25
def create_bills(ls):
    '''
    args:
        ls: list of lists of urls that correspond to matches

    returns:
        dictionary grouped by matches
    '''
    k = 0
    bill_id = 0
    bills = {}
    bad_count = 0
    for urls in ls:
        for url,state in urls:
            try:
                print "bill_id: " + str(bill_id)
                bills[bill_id] = {}
                doc = urllib2.urlopen(url).read()
                text = parser.from_buffer(doc)['content'] 
                bills[bill_id]['url'] = url
                bills[bill_id]['text'] = text
                bills[bill_id]['match'] = k
                bills[bill_id]['state'] = state
            except:
                pass
                bad_count += 1
                print 'bad_count: ', bad_count
            bill_id += 1
        k += 1

    #get more evaluation bills
    eval_bills = grab_more_eval_bills()
    for more_bills in eval_bills:
        print 'bill_group: ', k
        k +=1
        for text, state in more_bills:
            bill_id += 1
            print 'bill_id: ', bill_id

            bills[bill_id] = {}
            bills[bill_id]['text'] = text
            bills[bill_id]['state'] = state  
            bills[bill_id]['match'] = k

    try:
        for bill in bills.keys():
            if bills[bill] == {} or bills[bill]['text'] == '' \
                or bills[bill]['text'] == None:
                
                del bills[bill]
    except:
        pass

    return bills
Example #26
def tika_read(pdf_file_path):
    pages, _buffer = [], StringIO()
    data = parser.from_file(pdf_file_path, xmlContent=True)
    xhtml_data = BeautifulSoup(data['content'], features="lxml")
    for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        # clear the buffer so each page is parsed on its own
        _buffer.seek(0)
        _buffer.truncate()
        pages.append(parsed_content['content'])

    return pages
Example #27
def esIndexDoc(f, team, crawler, index, docType, failedList, failedReasons, procCount,
               url=None, outPath=None, storeprefix=None):
    CDRVersion = 2.0
    outFile = codecs.open(outPath +"/" + str(os.path.basename(f)), 'w', 'utf-8') if outPath else None
    with open(f, 'r') as fd:
            try:
                newDoc = {}
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                ccaDoc = json.loads(cbor.loads(c), encoding='utf8')
                newDoc["url"] = ccaDoc["url"]

                newDoc["timestamp"] = datetime.datetime.fromtimestamp(ccaDoc["imported"])
                newDoc["team"] = team
                newDoc["crawler"] = crawler

                contentType = getContentType(ccaDoc)
                newDoc["content_type"] = contentType

                parsed = parser.from_buffer(ccaDoc["response"]["body"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                if "content" in parsed:
                    newDoc["extracted_text"] = parsed["content"]
                if 'inlinks' in ccaDoc and ccaDoc['inlinks']:
                    newDoc["crawl_data"]["obj_parents"] = ccaDoc['inlinks']
                    newDoc["obj_parent"] = ccaDoc['inlinks'][0]
                # CDR version 2.0 additions
                newDoc["id"] = ccaDoc["key"]
                newDoc["obj_original_url"] = ccaDoc["url"]

                if 'text' in contentType or 'ml' in contentType:
                    # web page
                    newDoc["raw_content"] = ccaDoc["response"]["body"]
                else:
                    # binary content, we link to store
                    # ideally we should be storing it both the cases, but the CDR schema decided this way
                    newDoc["obj_stored_url"] = url_to_nutch_dump_path(ccaDoc["url"], prefix=storeprefix)

                newDoc["extracted_metadata"] = parsed["metadata"] if 'metadata' in parsed else {}
                newDoc["version"] = CDRVersion
                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                if url:
                    indexDoc(url, newDoc, index, docType)
                if outFile:
                    outFile.write(json.dumps(newDoc))
                    outFile.write("\n")
                    print "Processed " + f + " successfully"
                procCount += 1
            except Exception as err:
                failedList.append(f)
                failedReasons.append(str(err))
                traceback.print_exc()
Example #28
def digitize_pdf(file_path):
    file_data = []
    _buffer = StringIO()
    data = parser.from_file(file_path, xmlContent=True)
    xhtml_data = BeautifulSoup(data['content'], features="lxml")
    for page, content in enumerate(xhtml_data.find_all('div', attrs={'class': 'page'})):
        #print('Parsing page {} of pdf file...'.format(page+1))
        _buffer.write(str(content))
        parsed_content = parser.from_buffer(_buffer.getvalue())
        # clear the buffer so each page is parsed on its own
        _buffer.seek(0)
        _buffer.truncate()
        file_data.append({'id': str(page+1), 'content': parsed_content['content']})
    return file_data
Example #29
    async def worker(self, worker_id: int) -> None:
        """Main worker code. Listening to the beanstalk client for ready work, sending it to the
        tika service and then posting the extracted document to the queue for consumption by
        the analyzis module"""

        while True:
            logging.info("Worker [%s] waiting for work.", worker_id)
            try:
                job = await self.reserve()
            except greenstalk.TimedOutError:
                continue

            logging.info("Worker [%s] got job.", worker_id)
            try:
                meta_data = json.loads(job.body)
            except json.decoder.JSONDecodeError as err:
                logging.warning("Unable to decode body %s [%s ...]", err,
                                job.body[:25])
                self.client.delete(job)  # type: ignore
                continue

            if "filename" not in meta_data:
                logging.warning("No 'filename' field in meta data : %s",
                                meta_data)
                self.client.delete(job)  # type: ignore
                continue

            if not os.path.isfile(meta_data["filename"]):
                logging.warning("Could not find file '%s'",
                                meta_data["filename"])
                self.client.delete(job)  # type: ignore
                continue

            with open(meta_data["filename"], "rb") as fh:
                content = fh.read()
                if meta_data["filename"].endswith(".html"):
                    content = html.unescape(
                        content.decode("utf8")).encode("utf8")
                data = parser.from_buffer(content)
                data.update(meta_data)

            self.client.delete(job)  # type: ignore
            logging.info("Worker [%s] waiting to post result.", worker_id)
            try:
                self.client.put(  # type: ignore
                    gzip.compress(
                        json.dumps({
                            **data,
                            **meta_data
                        }).encode("utf8")))
                logging.info("Worker [%s] job done.", worker_id)
            except greenstalk.JobTooBigError:
                logging.error("Job to big: %s.", meta_data["filename"])
Example #30
def vaiNaFePDF(nome):
    raw = parser.from_buffer(nome)
    s = str(raw['content'])
    # trim leading and trailing newlines from the extracted text
    top = 0
    while s[top] == '\n':
        top += 1
    bottom = len(s) - 1
    while s[bottom] == '\n':
        bottom -= 1

    a = s[top:bottom + 1]
    print(a)
Example #31
def get_page(link):
    page = http.request("GET", link)
    headers = page.headers
    if "Content-Type" in headers:
        if "application/pdf" in headers["Content-Type"] or link[-4:] == ".pdf":
            page_content = parser.from_buffer(page.data, TIKA_API)["content"]
            page_type = "pdf"
        else:
            # else treat as html page by default
            page_content = bs(page.data.decode('utf-8'), 'lxml')
            page_type = "html"
    root_url = get_root_url(link)
    return page_content, root_url, page_type
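get_page relies on a module-level urllib3 pool manager and a TIKA_API endpoint (and a bs alias for BeautifulSoup plus a get_root_url helper) that are not shown; a plausible setup, with the endpoint value as an assumption:

import urllib3

http = urllib3.PoolManager()
TIKA_API = "http://localhost:9998"   # assumed local Tika server endpoint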
Example #32
    def process(self, ftl_doc, force):
        parsed_txt = None

        if force or not ftl_doc.count_pages:
            with ftl_doc.binary.open("rb") as ff:
                parsed_txt = parser.from_buffer(ff.read())

            if "metadata" in parsed_txt and "xmpTPg:NPages" in parsed_txt[
                    "metadata"]:
                atomic_ftl_doc_update(
                    ftl_doc.pid,
                    {
                        "count_pages": int(
                            parsed_txt["metadata"]["xmpTPg:NPages"])
                    },
                )
            else:
                logger.warning(
                    f"{self.log_prefix} Pages number can't be retrieved for document {ftl_doc.pid}"
                )

        else:
            logger.debug(
                f"{self.log_prefix} Skipping Tika extract (page count) for document {ftl_doc.pid}"
            )

        if force or not ftl_doc.content_text:
            if not parsed_txt:
                with ftl_doc.binary.open("rb") as ff:
                    parsed_txt = parser.from_buffer(ff.read())

            if "content" in parsed_txt and parsed_txt["content"]:
                atomic_ftl_doc_update(
                    ftl_doc.pid,
                    {"content_text": parsed_txt["content"].strip()})
        else:
            logger.debug(
                f"{self.log_prefix} Skipping Tika extract (text) for document {ftl_doc.pid}"
            )
Example #33
 def parse_file(self, binary):
     try:
         parsed = parser.from_buffer(binary, "http://" + os.environ["pfe_tika_host"] + ":9998/tika")
         content_type = parsed["metadata"]["Content-Type"]
         content_type = content_type[1] if isinstance(content_type, list) else content_type
         # TODO: what to do in elastic search & in the ui when the index is empty ?
         #  Answer: it disappear forever ... what to do then ?
         content = parsed["content"] if parsed["content"] is not None else ""
     except Exception as e:
         content = ""
         content_type = "UNKOWN"
         print("exception in parser.py" + str(e))
     return content, content_type
Example #34
def get_plain_text(link):
    """

    :param link: link to a document (including an HTML doc),
    :return: plain text of that document
    """
    print('Current link: {}'.format(link), end='\r')
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    }
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    if 'drive.google.com' in link:
        try:
            link = convert_google_url(link)
        except Exception as e:
            link = link
            print("Google Error:", e)
    try:
        with timeout(15, exception=RuntimeError):
            req = request.Request(link, headers=headers)
            buff = request.urlopen(req, context=ctx)
            parsed = parser.from_buffer(buff)
            return parsed['content']

    except Exception as e:
        try:
            resp = requests.get(link)
            parsed = parser.from_buffer(resp.content)
            return parsed['content']
        except:
            print('error:', e, 'link:', link)
            return 'UNAVAILABLE'
Example #35
    def __init__(self, path=None, content=None, tikaServer=None):
        if path is not None:
            parsed = parser.from_file(path, tikaServer)
        elif content is not None:
            parsed = parser.from_buffer(content, tikaServer)
        else:
            raise Exception("A path to the file or the content of the file must be provided.")

        self.data = ""
        if parsed["content"]:
            self.data = parsed["content"].strip()

        self.metadata = []
        if "metadata" in parsed:
            self.metadata = parsed
Example #36
def tika_parse(html, show_content=False):
    parsed = None
    try:
        ServerEndpoint = u"http://localhost:9998"
        parsed = parser.from_buffer(html, serverEndpoint=ServerEndpoint)

        n = 0
        if show_content is True:
            for k, v in parsed[u"metadata"].items():
                logger.debug(u"    {} {} = {}".format(n, k, v))
                n += 1

            logger.debug(u"  {} Content = {} ...".format(n, parsed[u"content"].strip()))

    except Exception, msg:
        logger.error(u"{}".format(msg))
Example #37
def cleanseBody(theDoc):
    if not tika_support:
        print (
            "cleanseBody requires Tika to be installed."
            "Please check the documentation on how to install "
            "ETLlib with Tika support."
        )
        raise RuntimeError("Tika support not installed.")

    if "body" in theDoc:
        content = "<html>".encode('utf-8')+theDoc["body"].encode('utf-8')+"</html>".encode('utf-8')
        parsed = parser.from_buffer(content)
        for key,val in parsed["metadata"].iteritems():
            if key not in theDoc:
                theDoc[key] = val
            else:
                theDoc["tika_"+key] = val
        theDoc["body"] = parsed["content"]
Example #38
def run_exist_tool(dir_list, output_name, subtype):
    file_list = get_file_list(dir_list)
    data = []
    for idx, val in enumerate(file_list):
        print(idx)
        with open(val) as input_file:
            mime_type = str()
            if subtype is not None:
                mime_type = subtype
            else:
                subtype = ''
                mime_type = detector.from_buffer(input_file)
            if mime_type is not None and mime_type.endswith(subtype):
                parsed = parser.from_buffer(input_file)
                if 'metadata' in parsed and parsed['metadata'] is not None:
                    file_name = val.split('/')[-1]
                    data.append({file_name: parsed['metadata']})

    dump_to_json(output_name, data)
    return
Example #39
rawText = pytesseract.image_to_string(Image.open(filename2), lang="rus")
print (rawText)
lines = rawText.split('\n')

import os
#os.putenv( 'TIKA_VERSION','default')  # - set to the version string, e.g., 1.12 or default to current Tika version.
#os.putenv( 'TIKA_SERVER_JAR','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') #- set to the full URL to the remote Tika server jar to download and cache.
os.putenv('TIKA_SERVER_ENDPOINT', 'http://localhost:9998')  # set to the host (local or remote) for the running Tika server jar.
#os.putenv( 'TIKA_SERVER_ENDPOINT',' http://localhost:9998/language/string') #- set to the host (local or remote) for the running Tika server jar.
#os.putenv( 'TIKA_CLIENT_ONLY','True') #- if set to True, then TIKA_SERVER_JAR is ignored, and relies on the value for TIKA_SERVER_ENDPOINT and treats Tika like a REST client.
#os.putenv( 'TIKA_TRANSLATOR','org/apache/tika/language/translate/') #- set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
#os.putenv( 'TIKA_SERVER_CLASSPATH','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') #- set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.
import tika
tika.initVM()
from tika import parser, detector, language, translate
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])
global Verbose
Verbose=True

result=translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)
result = detector.from_buffer("comme çi comme ça")
print (result)
result = translate.from_buffer("comme çi comme ça",'fr','en')
print (result)
result = language.from_buffer("comme çi comme ça")
print (result)
for line in lines:
    if len(line)>0:
            data["responsestatus"]=responsestatus
            data["responseheader"]=responseheader

            #Get NER data
            docType = detector.from_file(path_to_file)
            if docType in tagRatioFileTypes:
                buffer = handleHtml(path_to_file,docType)
            else:
                try:
                    buffer=subprocess.check_output(['java', '-jar', tikaSnapshotPath, '-t', path_to_file])
                except:
                    errorFile.write(path_to_file+"\n")
                    continue
            if (buffer==None):
                errorFile.write(path_to_file+"\n")
                continue
            if (len(buffer)==0):
                    errorFile.write(path_to_file+"\n")
                    #continue

            parsedData=parser.from_buffer(buffer)

            metaData=formatMeta(parsedData,docType,buffer)
            data["NER"]=metaData

            measurementJson[path_to_file]=data

json.dump(measurementJson,measurementFile,indent=4)
measurementFile.close()
errorFile.close()
def esIndex(ccaDir, team, crawler, index, docType, url=None, outPath=None, storeprefix=None):
    if not url and not outPath:
        raise Exception("Either Elastic Url or output path must be specified.")
    ccaJsonList = list_files(ccaDir)
    print "Processing ["+str(len(ccaJsonList))+"] files."

    procCount = 0
    failedList=[]
    failedReasons=[]
    CDRVersion = 2.0
    contentLengthThreshold = 150
    fail_keys = ["warning", "return", "string"], ["error", "404"], ["domain", "expired"], ["annonse", "ble", "ikke",
                                                                                           "funnet"], ["no", "se",
                                                                                                       "pudo",
                                                                                                       "encontrar",
                                                                                                       "el", "anuncio",
                                                                                                       "solicitado"]

    outFile = codecs.open(outPath, 'w', 'utf-8') if outPath else None

    for f in ccaJsonList:
        with open(f, 'r') as fd:
            try:
                newDoc = {}
                c = fd.read()
                # fix for no request body out of Nutch CCA
                c = c.replace("\"body\" : null", "\"body\" : \"null\"")
                verboseLog("Parsing [" + f + "]")
                ccaDoc = simplejson.loads(cbor.loads(c).value, encoding='utf8')

                if "Content-Type" in ccaDoc["response"]["headers"] and not ccaDoc["response"]["headers"]["Content-Type"].split(";")[0]:
                    failedList.append(f)
                    failedReasons.append("No Content Type Found")
                    continue

                newDoc["url"] = ccaDoc["url"]

                newDoc["timestamp"] = ccaDoc["imported"]
                newDoc["team"] = team
                newDoc["crawler"] = crawler

                contentType = getContentType(ccaDoc)
                newDoc["content_type"] = contentType

                parsed = parser.from_buffer(ccaDoc["response"]["body"].encode("utf-8"))
                newDoc["crawl_data"] = {}
                if "content" in parsed:
                    newDoc["crawl_data"]["content"] = parsed["content"]
                    newDoc["extracted_text"] = parsed["content"]
                if 'inlinks' in ccaDoc and ccaDoc['inlinks']:
                    newDoc["crawl_data"]["obj_parents"] = ccaDoc['inlinks']
                    newDoc["obj_parent"] = ccaDoc['inlinks'][0]
                # CDR version 2.0 additions
                newDoc["_id"] = ccaDoc["key"]
                newDoc["obj_original_url"] = ccaDoc["url"]

                if 'text' in contentType or 'ml' in contentType:
                    # web page
                    newDoc["raw_content"] = ccaDoc["response"]["body"]
                    if "obj_parent" in newDoc:
                        del newDoc["obj_parent"]
                else:
                    # binary content, we link to store
                    # ideally we should be storing it both the cases, but the CDR schema decided this way
                    newDoc["obj_stored_url"] = url_to_nutch_dump_path(ccaDoc["url"], prefix=storeprefix)

                newDoc["extracted_metadata"] = parsed["metadata"] if 'metadata' in parsed else {}
                newDoc["version"] = CDRVersion

                # Validation Checks
                if "obj_parent" not in newDoc and ("raw_content" not in newDoc or not newDoc["raw_content"]):
                    failedList.append(f)
                    failedReasons.append("No Raw Content Found")
                    continue

                if "obj_parent" not in newDoc and len(newDoc["raw_content"]) < contentLengthThreshold:
                    failedList.append(f)
                    failedReasons.append("Raw Content is less than " + str(contentLengthThreshold) + " characters")
                    continue

                # Check for fail_keys
                if "obj_parent" not in newDoc and len(newDoc["raw_content"]) >= contentLengthThreshold:
                    is_fail = False
                    content_nopunct = remove_punctuation(newDoc["raw_content"])
                    content_list = content_nopunct.split(" ")
                    for key_list in fail_keys:
                        counter = 0
                        listlen = len(key_list)
                        for item in key_list:
                            if item in content_list:
                                counter += 1
                        if counter >= listlen:
                            is_fail = True
                    if "request ad could not be" in content_nopunct:
                        is_fail = True

                    if is_fail:
                        failedList.append(f)
                        failedReasons.append("Raw Content indicates failed crawl")
                        continue

                if "obj_parent" not in newDoc and ("extracted_text" not in newDoc or not newDoc["extracted_text"]):
                    failedList.append(f)
                    failedReasons.append("No Extracted Text Found")
                    continue

                verboseLog("Indexing ["+f+"] to Elasticsearch.")
                if url:
                    indexDoc(url, newDoc, index, docType)
                if outFile:
                    outFile.write(simplejson.dumps(newDoc))
                    outFile.write("\n")
                procCount += 1
            except Exception as err:
                failedList.append(f)
                failedReasons.append(str(err))
                traceback.print_exc()
    if outFile:
        print("Output Stored at %s" % outPath)
        outFile.close()
    print "Processed " + str(procCount) + " CBOR files successfully."
    print "Failed files: " + str(len(failedList))

    if _verbose:
        for i in range(len(failedList)):
            verboseLog("File: "+failedList[i]+" failed because "+failedReasons[i])
mergeJsonFile=open("/Users/charanshampur/PycharmProjects/assign2/Merged.json","r")
sweetJsonFile=open("Sweet.json","w")
jsonLoad=json.load(mergeJsonFile)

def getSweetNer(metaData):
    metaDataFormatted={}
    for key,value in metaData.items():
        if(re.match("NER",key)):
            metaDataFormatted[key]=value
    return metaDataFormatted

sweetJson=[]
for doc in jsonLoad:
    print doc["id"]
    sweetDict={}
    sweetDict.update(doc)
    if "content" in doc:
        parsedData=parser.from_buffer(doc["content"])
        if "metadata" in parsedData:
            meta=getSweetNer(parsedData["metadata"])
            if(len(meta)>0):
                sweetDict.update(meta)
    sweetJson.append(sweetDict)

json.dump(sweetJson,sweetJsonFile,indent=4)