def real_progress():
    self.progress.grid(row=1, column=1, sticky='e')
    self.progress.start()
    global input_dir
    if input_dir.get() != "":
        tc = TextExtractor(file_name)
        tc.process_text()
    time.sleep(5)
    self.progress.stop()
    self.progress.grid_forget()
    input_dir.delete(0, END)
    self.btn['state'] = 'normal'
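# Hedged sketch, not part of the original snippet: real_progress() blocks on
# time.sleep(5), so a Tkinter GUI would typically run it off the main loop in
# a worker thread to keep the window responsive, e.g.:
import threading

threading.Thread(target=real_progress, daemon=True).start()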
def process_page(row: tuple):
    _, web_arch_record = row
    if not web_arch_record:
        return
    logger.info('Processing a warc record...')
    warc_record = WarcRecord(web_arch_record)
    warc_payload = warc_record.payload
    if warc_record.broken:
        return
    words = TextExtractor.get_all_words(warc_payload)
    words = [word for word in words if word not in stopwords.words('english')]
    ners = get_continuous_chunks(re.sub('<[^>]*>', ' ', warc_payload))
    context_size = 10
    canonical_labels_of_ids = dict()
    related_ids_of_ids = dict()
    ids_of_words = defaultdict(list)
    words_wo_repetitions = []
    # Build sets
    for word in words:
        es_results = es.search(word)
        repeated = False
        for es_result in es_results:
            if canonical_labels_of_ids.get(es_result.id) is not None:
                # We are not interested in labels with repeating freebase ids,
                # so we just skip them
                repeated = True
                break
        if repeated:
            continue
        for es_result in es_results:
            if canonical_labels_of_ids.get(es_result.id) is not None:
                continue
            ids_of_words[word].append(es_result.id)
            canonical_labels_of_ids[es_result.id] = es_result.label
            related_ids_of_ids[es_result.id] = set(i for i in sparql.search(es_result.id))
        words_wo_repetitions.append(word)
    # Calc links
    for i, word in enumerate(words_wo_repetitions):
        if word in ners:
            max_common = -1
            id_with_max_common = None
            for j in range(i - context_size, i + context_size + 1):
                if j != i and 0 <= j < len(words_wo_repetitions):
                    for freebase_id in ids_of_words[word]:
                        for other_id in ids_of_words[words_wo_repetitions[j]]:
                            common = len(related_ids_of_ids[other_id].intersection(related_ids_of_ids[freebase_id]))
                            if common > max_common:
                                max_common = common
                                id_with_max_common = freebase_id
            if id_with_max_common is None:
                # No label intersects with anything, let's just pick the first one
                id_with_max_common = ids_of_words[word][0]
            label_with_max_common = canonical_labels_of_ids[id_with_max_common]
            yield stringify_reply(warc_record.id, word, id_with_max_common)
    logger.info('Processed record %s', warc_record.id)
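# get_continuous_chunks is not defined in this snippet. Below is a minimal
# sketch of one plausible implementation, following the common NLTK recipe of
# grouping ne_chunk() subtrees into contiguous named entities; the original
# helper may differ.
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunks = []
    current_chunk = []
    for node in chunked:
        if isinstance(node, Tree):
            # A Tree node marks one named entity; join its word tokens.
            current_chunk.append(' '.join(token for token, pos in node.leaves()))
        elif current_chunk:
            # A non-entity token ends the current run of entities.
            continuous_chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        continuous_chunks.append(' '.join(current_chunk))
    return continuous_chunks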
def extract():
    if request.args.get('websiteURL') is None:
        abort(500)
    # Extract text
    print("Received quizme GET request, website URL = " + str(request.args.get('websiteURL', '')))
    extractor = TextExtractor(request.args.get('websiteURL', ''))
    extractor.extract()
    # Perform java processing
    subprocess.check_call("gradle run", shell=True)
    # Return result from file
    with open('res/Output', 'r') as f:
        result = f.read()
    return result
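# Minimal sketch of how the extract() view above might be wired into a Flask
# app; the route path '/quizme' is an assumption based on the log message.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/quizme', 'extract', extract, methods=['GET'])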
def start_threads():
    cache_dict.clear()
    new_threads = [
        T(q, cache_dict, stop_event, speech_queue)
        for T in SPEECH_THREADS
    ]
    new_threads.append(TextExtractor(q, cache_dict, stop_event))
    # new_threads.append(ConsoleReader(q, cache_dict, stop_event))
    for t in new_threads:
        t.start()
    threads.extend(new_threads)
    return jsonify({'success': True})
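# Hedged counterpart sketch, not in the original: the threads started above
# share a stop_event, so a stop handler would plausibly look like this.
def stop_threads():
    stop_event.set()
    for t in threads:
        t.join()
    threads.clear()
    return jsonify({'success': True})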
def __init__(self, input_dir="", extract_text=True, correct_spelling=False,
             extract_meme_context=False, output_dir=""):
    TEX = TextExtractor()
    files = pathlib.Path(input_dir).glob("*")
    files = [x for x in files if x.is_file()]
    image_labels = TEX.extract_text(files)
    if correct_spelling:
        # Build the speller and load the segmenter once, not per image.
        check = Speller(lang='en')
        wordsegment.load()
        for img in image_labels:
            if not img.ocr:
                continue
            # Re-segment the raw OCR string into words, then spell-correct.
            ocr = " ".join(wordsegment.segment(img.ocr))
            img.ocr = check(ocr)
    for image in image_labels:
        image.save_to(output_dir)
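# Standalone illustration of the correction step above, assuming the
# `autocorrect` and `wordsegment` packages; the input string is made up.
from autocorrect import Speller
import wordsegment

wordsegment.load()
check = Speller(lang='en')
tokens = wordsegment.segment("thequickbrownfox")  # -> ['the', 'quick', 'brown', 'fox']
print(check(" ".join(tokens)))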
def __init__(self, editor):
    self.editor = editor
    self.signals = Signals()
    self.triggers = TriggerManager(editor)
    connect_all(self, self.signals, self.triggers,
                textbuffer=self.editor.textbuffer)
    self.block_word_reset = False
    self.words = None
    self.start_word = None
    self.start_offset = None
    self.indexer = IndexerProcessManager(self.signals.sender, editor)
    self.dictionary_manager = DictionaryManager(self.signals.sender, editor)
    self.communicator = ProcessCommunicator(self.signals.sender, editor)
    self.extractor = TextExtractor(self.signals.sender, editor)
    self.buffer_monitor = BufferMonitor(self.signals.sender, editor)
__author__ = 'liuqiang'

import chardet
import urllib2

from TextExtractor import TextExtractor
from Charsetdet import charsetdet

te = TextExtractor()
f = urllib2.urlopen('http://www.bbc.com/news/world-asia-china-33728654')
#f = urllib2.urlopen('http://www.sxdaily.com.cn/n/2015/0731/c142-5720819-4.html')
html = f.read()
code = charsetdet(html)
if code is None:
    # chardet.detect returns a dict such as {'encoding': ..., 'confidence': ...},
    # so only the encoding name is kept.
    code = chardet.detect(html)['encoding']
c = html.decode(code)
print te.extract(c)
import argparse

from TextExtractor import TextExtractor
from QuizGenerator import QuizGenerator

if __name__ == '__main__':
    # TODO: Clarify description.
    # Parse args.
    parser = argparse.ArgumentParser(
        description='Generate quiz from given pdf file.')
    parser.add_argument('file_path', type=str, help='file_path')
    # parser.add_argument('quiz_cnt', type=int, help='quiz_cnt')
    args = parser.parse_args()
    file_path = args.file_path
    # quiz_cnt = args.quiz_cnt

    # Extract text from pdf.
    extractor = TextExtractor(file_path)
    text = extractor.extract_text()

    # Generate quiz from text.
    quizgenerator = QuizGenerator(text)
    problem, answer = quizgenerator.generate_quiz()

    # Print `problem|answer` to stdout.
    res = problem + "|" + answer
    print(res)
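# Hedged sketch of a consumer parsing the `problem|answer` line printed
# above; the script and file names are hypothetical, and this assumes the
# problem text contains no '|' separator.
import subprocess

out = subprocess.check_output(['python', 'quiz_cli.py', 'slides.pdf'], text=True)
problem, answer = out.strip().split('|', 1)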
# -*- coding: utf-8 -*-
import os
import sys

from TextExtractor import TextExtractor, Template

if len(sys.argv) < 2:
    print('No URL')
    exit(1)

url = sys.argv[1]
try:
    _, _, site = TextExtractor.get_path_params(url)
    template = Template()
    file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates', site)
    print('Loading template from file', file)
    if not template.load(file):
        print('Template not loaded')
    extractor = TextExtractor()
    extractor.extract(url, template)
    count = extractor.format()
    extractor.save()
except Exception as err:
    print('Error', err.args)
else:
    print(count, 'line(s) saved to file', os.sep.join(TextExtractor.get_path_params(url)[0:2]))