Example #1
        def real_progress():
            # Show and start the indeterminate progress bar.
            self.progress.grid(row=1, column=1, sticky='e')
            self.progress.start()
            global input_dir
            if input_dir.get() != "":
                # Run extraction on the path typed into the entry widget.
                tc = TextExtractor(input_dir.get())
                tc.process_text()
            time.sleep(5)
            # Hide the progress bar, clear the entry, and re-enable the button.
            self.progress.stop()
            self.progress.grid_forget()
            input_dir.delete(0, END)

            self.btn['state'] = 'normal'
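Because real_progress() blocks on process_text() and time.sleep(5), calling it directly on the Tk main thread would freeze the UI along with the progress bar. The eight-space indent suggests it is nested inside a click handler; a minimal sketch of that enclosing handler, assuming a standard worker-thread pattern (the handler name is a guess, and strictly speaking Tk widgets should only be touched from the main thread, so a production version would marshal widget updates back via after()):

import threading

def loading(self):
    # Hypothetical click handler: disable the button, then run the blocking
    # work off the Tk main loop so the progress bar keeps animating.
    self.btn['state'] = 'disabled'

    def real_progress():
        ...  # body as in Example #1

    threading.Thread(target=real_progress, daemon=True).start()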
Example #2
def process_page(row: tuple):
    _, web_arch_record = row
    if not web_arch_record:
        return
    logger.info('Processing a warc record...')
    warc_record = WarcRecord(web_arch_record)
    warc_payload = warc_record.payload
    if warc_record.broken:
        return
    # Build the stopword set once instead of re-reading it for every word.
    english_stopwords = set(stopwords.words('english'))
    words = TextExtractor.get_all_words(warc_payload)
    words = [word for word in words if word not in english_stopwords]
    # Named entities from the payload with HTML tags stripped.
    ners = get_continuous_chunks(re.sub('<[^>]*>', ' ', warc_payload))

    context_size = 10
    canonical_labels_of_ids = dict()
    related_ids_of_ids = dict()
    ids_of_words = defaultdict(list)
    words_wo_repetitions = []
    # Build sets
    for word in words:
        es_results = es.search(word)
        repeated = False
        for es_result in es_results:
            if canonical_labels_of_ids.get(es_result.id) is not None:
                # We are not interested in labels with repeating freebase ids,
                # so we just skip them
                repeated = True
                break
        if repeated:
            continue
        for es_result in es_results:
            if canonical_labels_of_ids.get(es_result.id) is not None:
                # Duplicate id within the same result list; keep the first.
                continue
            ids_of_words[word].append(es_result.id)
            canonical_labels_of_ids[es_result.id] = es_result.label
            related_ids_of_ids[es_result.id] = set(sparql.search(es_result.id))
        words_wo_repetitions.append(word)

    # Calc links
    for i, word in enumerate(words_wo_repetitions):
        # Skip words that are not named entities or got no candidate ids.
        if word in ners and ids_of_words[word]:
            max_common = -1
            id_with_max_common = None
            # Compare this word's candidate ids against every other word's
            # candidates inside a +/- context_size window.
            for j in range(max(0, i - context_size),
                           min(len(words_wo_repetitions), i + context_size + 1)):
                if j == i:
                    continue
                for freebase_id in ids_of_words[word]:
                    for other_id in ids_of_words[words_wo_repetitions[j]]:
                        common = len(related_ids_of_ids[other_id]
                                     & related_ids_of_ids[freebase_id])
                        if common > max_common:
                            max_common = common
                            id_with_max_common = freebase_id
            if id_with_max_common is None:
                # No label intersects with anything, let's just pick the first one
                id_with_max_common = ids_of_words[word][0]
            yield stringify_reply(warc_record.id, word, id_with_max_common)
    logger.info('Processed record %s', warc_record.id)
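Because process_page() yields its results, a caller has to iterate it to drive the work. A hypothetical driver, assuming rows arrive as (key, record) pairs as the tuple unpacking at the top implies (the row source is not shown in the snippet):

for row in warc_rows:  # hypothetical iterable of (key, record) pairs
    for reply in process_page(row):
        print(reply)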
Example #3
def extract():
    # A missing required parameter is a client error, so reply 400 rather
    # than a generic 500.
    if request.args.get('websiteURL') is None:
        abort(400)

    # Extract text
    print("Received quizme GET request, website URL = " +
          str(request.args.get('websiteURL', '')))
    extractor = TextExtractor(request.args.get('websiteURL', ''))
    extractor.extract()

    # Perform java processing
    subprocess.check_call("gradle run", shell=True)

    # Return result from file
    with open('res/Output', 'r') as f:
        result = f.read()
    return result
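The snippet uses request, abort, and subprocess without showing the surrounding module; a minimal sketch of the assumed Flask wiring (the route path is a guess suggested by the "quizme" log line, everything else mirrors the call sites above):

import subprocess
from flask import Flask, request, abort
from TextExtractor import TextExtractor

app = Flask(__name__)

@app.route('/quizme')  # hypothetical route path
def extract():
    ...                # body as shown in Example #3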
Example #4
def start_threads():
    cache_dict.clear()
    new_threads = [
        T(q, cache_dict, stop_event, speech_queue) for T in SPEECH_THREADS
    ]
    new_threads.append(TextExtractor(q, cache_dict, stop_event))
    # new_threads.append(ConsoleReader(q, cache_dict, stop_event))
    for t in new_threads:
        t.start()
    threads.extend(new_threads)
    return jsonify({'success': True})
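start_threads() leans on module-level state that the snippet does not show; a sketch of plausible definitions, inferred purely from how the names are used above (the concrete types are assumptions):

import queue
import threading

q = queue.Queue()               # work queue shared by all threads (assumed)
speech_queue = queue.Queue()    # queue consumed by the speech threads (assumed)
cache_dict = {}                 # shared cache, cleared on every restart
stop_event = threading.Event()  # cooperative shutdown signal
threads = []                    # every thread started so far
SPEECH_THREADS = []             # thread classes taking (q, cache_dict, stop_event, speech_queue)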
Example #5
    def __init__(self,
                 input_dir="",
                 extract_text=True,
                 correct_spelling=False,
                 extract_meme_context=False,
                 output_dir=""):
        TEX = TextExtractor()
        files = pathlib.Path(input_dir).glob("*")
        files = [x for x in files if x.is_file()]
        image_labels = TEX.extract_text(files)

        if correct_spelling:
            # Build the speller and load the wordsegment corpus once,
            # not once per image.
            check = Speller(lang='en')
            wordsegment.load()
            for img in image_labels:
                if not img.ocr:
                    continue
                # Re-segment the raw OCR string, then spell-correct it.
                ocr = " ".join(wordsegment.segment(img.ocr))
                img.ocr = check(ocr)

        for image in image_labels:
            image.save_to(output_dir)
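Only the __init__ is shown, so the class name is unknown; a hypothetical usage, with MemePipeline standing in for whatever the real class is called:

MemePipeline(input_dir="memes/",   # hypothetical class name and paths
             correct_spelling=True,
             output_dir="out/")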
Example #6
    def __init__(self, editor):
        self.editor = editor

        # Wire this object's signal handlers to the editor's text buffer.
        self.signals = Signals()
        self.triggers = TriggerManager(editor)
        connect_all(self, self.signals, self.triggers, textbuffer=self.editor.textbuffer)

        # Word-tracking state used while the user types.
        self.block_word_reset = False
        self.words = None
        self.start_word = None
        self.start_offset = None

        # Helper components that all share the same signal sender.
        self.indexer = IndexerProcessManager(self.signals.sender, editor)
        self.dictionary_manager = DictionaryManager(self.signals.sender, editor)
        self.communicator = ProcessCommunicator(self.signals.sender, editor)
        self.extractor = TextExtractor(self.signals.sender, editor)
        self.buffer_monitor = BufferMonitor(self.signals.sender, editor)
Example #7
__author__ = 'liuqiang'

# Python 2 example: urllib2 and the print statement date it.
import chardet
import urllib2

from TextExtractor import TextExtractor
from Charsetdet import charsetdet

te = TextExtractor()

f = urllib2.urlopen('http://www.bbc.com/news/world-asia-china-33728654')
#f = urllib2.urlopen('http://www.sxdaily.com.cn/n/2015/0731/c142-5720819-4.html')

html = f.read()

code = charsetdet(html)

if code is None:
    # Fall back to chardet; detect() returns a dict, so take the encoding key.
    code = chardet.detect(html)['encoding']

c = html.decode(code)

print te.extract(c)
Example #8
import argparse
from TextExtractor import TextExtractor
from QuizGenerator import QuizGenerator

if __name__ == '__main__':
    # TODO: Clarify description.

    # Parse args.
    parser = argparse.ArgumentParser(
        description='Generate quiz from given pdf file.')
    parser.add_argument('file_path', type=str, help='file_path')
    # parser.add_argument('quiz_cnt', type=int, help='quiz_cnt')
    args = parser.parse_args()

    file_path = args.file_path
    # quiz_cnt = args.quiz_cnt

    # Extract text from pdf.
    extractor = TextExtractor(file_path)
    text = extractor.extract_text()

    # Generate quiz from text.
    quizgenerator = QuizGenerator(text)
    problem, answer = quizgenerator.generate_quiz()

    # Print `problem|answer` to stdout.
    res = problem + "|" + answer
    print(res)
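Example #8's call sites pin down the interface it expects; a pair of stand-in stubs consistent with them, useful for dry-running the script without the real modules (the bodies are placeholders, not the actual implementations):

class TextExtractor:
    def __init__(self, file_path):
        self.file_path = file_path

    def extract_text(self):
        # Placeholder: the real class parses the PDF at self.file_path.
        return "stub text from " + self.file_path

class QuizGenerator:
    def __init__(self, text):
        self.text = text

    def generate_quiz(self):
        # Placeholder: the real class derives a problem/answer pair from self.text.
        return "stub problem", "stub answer"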
Example #9
# -*- coding: utf-8 -*-
import os
import sys
from TextExtractor import TextExtractor, Template

if len(sys.argv) < 2:
    print('No URL')
    sys.exit(1)

url = sys.argv[1]

try:
    _, _, site = TextExtractor.get_path_params(url)
    template = Template()
    # Templates live next to this script, one file per site.
    template_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates', site)
    print('Loading template from file', template_file)
    if not template.load(template_file):
        print('Template not loaded')

    extractor = TextExtractor()
    extractor.extract(url, template)
    count = extractor.format()
    extractor.save()
except Exception as err:
    print('Error', err.args)
else:
    print(count, 'line(s) saved to file', os.sep.join(TextExtractor.get_path_params(url)[0:2]))