def real_progress():
    self.progress.grid(row=1, column=1, sticky='e')
    self.progress.start()
    global input_dir
    if input_dir.get() != "":
        tc = TextExtractor(file_name)
        tc.process_text()
    time.sleep(5)
    self.progress.stop()
    self.progress.grid_forget()
    input_dir.delete(0, END)
    self.btn['state'] = 'normal'
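# Hedged sketch, not part of the original snippet: real_progress() blocks on
# time.sleep(5), so a Tkinter GUI would typically run it off the main loop in
# a worker thread to keep the window responsive, e.g.:
import threading

threading.Thread(target=real_progress, daemon=True).start()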
def process_page(row: tuple):
    _, web_arch_record = row
    if not web_arch_record:
        return
    logger.info('Processing a warc record...')
    warc_record = WarcRecord(web_arch_record)
    warc_payload = warc_record.payload
    if warc_record.broken:
        return
    words = TextExtractor.get_all_words(warc_payload)
    words = [word for word in words if word not in stopwords.words('english')]
    ners = get_continuous_chunks(re.sub('<[^>]*>', ' ', warc_payload))
    context_size = 10
    canonical_labels_of_ids = dict()
    related_ids_of_ids = dict()
    ids_of_words = defaultdict(list)
    words_wo_repetitions = []
    # Build sets
    for word in words:
        es_results = es.search(word)
        repeated = False
        for es_result in es_results:
            if canonical_labels_of_ids.get(es_result.id) is not None:
                # We are not interested in labels with repeating freebase ids,
                # so we just skip them
                repeated = True
                break
        if repeated:
            continue
        for es_result in es_results:
            if canonical_labels_of_ids.get(es_result.id) is not None:
                continue
            ids_of_words[word].append(es_result.id)
            canonical_labels_of_ids[es_result.id] = es_result.label
            related_ids_of_ids[es_result.id] = set(i for i in sparql.search(es_result.id))
        words_wo_repetitions.append(word)
    # Calc links
    for i, word in enumerate(words_wo_repetitions):
        if word in ners:
            max_common = -1
            id_with_max_common = None
            for j in range(i - context_size, i + context_size + 1):
                if j != i and 0 <= j < len(words_wo_repetitions):
                    for freebase_id in ids_of_words[word]:
                        for other_id in ids_of_words[words_wo_repetitions[j]]:
                            common = len(related_ids_of_ids[other_id].intersection(related_ids_of_ids[freebase_id]))
                            if common > max_common:
                                max_common = common
                                id_with_max_common = freebase_id
            if id_with_max_common is None:
                # No label intersects with anything, let's just pick the first one
                id_with_max_common = ids_of_words[word][0]
            label_with_max_common = canonical_labels_of_ids[id_with_max_common]
            yield stringify_reply(warc_record.id, word, id_with_max_common)
    logger.info('Processed record %s', warc_record.id)
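# get_continuous_chunks is not defined in this snippet. Below is a minimal
# sketch of one plausible implementation, following the common NLTK recipe of
# grouping ne_chunk() subtrees into contiguous named entities; the original
# helper may differ.
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunks = []
    current_chunk = []
    for node in chunked:
        if isinstance(node, Tree):
            # A Tree node marks one named entity; join its word tokens.
            current_chunk.append(' '.join(token for token, pos in node.leaves()))
        elif current_chunk:
            # A non-entity token ends the current run of entities.
            continuous_chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        continuous_chunks.append(' '.join(current_chunk))
    return continuous_chunks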
def extract():
    if request.args.get('websiteURL') is None:
        abort(500)
    # Extract text
    print("Received quizme GET request, website URL = " + str(request.args.get('websiteURL', '')))
    extractor = TextExtractor(request.args.get('websiteURL', ''))
    extractor.extract()
    # Perform java processing
    subprocess.check_call("gradle run", shell=True)
    # Return result from file
    with open('res/Output', 'r') as f:
        result = f.read()
    return result
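# Minimal sketch of how the extract() view above might be wired into a Flask
# app; the route path '/quizme' is an assumption based on the log message.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/quizme', 'extract', extract, methods=['GET'])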
def start_threads():
    cache_dict.clear()
    new_threads = [
        T(q, cache_dict, stop_event, speech_queue)
        for T in SPEECH_THREADS
    ]
    new_threads.append(TextExtractor(q, cache_dict, stop_event))
    # new_threads.append(ConsoleReader(q, cache_dict, stop_event))
    for t in new_threads:
        t.start()
    threads.extend(new_threads)
    return jsonify({'success': True})
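# Hedged counterpart sketch, not in the original: the threads started above
# share a stop_event, so a stop handler would plausibly look like this.
def stop_threads():
    stop_event.set()
    for t in threads:
        t.join()
    threads.clear()
    return jsonify({'success': True})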
def __init__(self, input_dir="", extract_text=True, correct_spelling=False,
             extract_meme_context=False, output_dir=""):
    TEX = TextExtractor()
    files = pathlib.Path(input_dir).glob("*")
    files = [x for x in files if x.is_file()]
    image_labels = TEX.extract_text(files)
    if correct_spelling:
        # Build the speller and load the segmenter once, not per image.
        check = Speller(lang='en')
        wordsegment.load()
        for img in image_labels:
            if not img.ocr:
                continue
            # Re-segment the raw OCR string into words, then spell-correct.
            ocr = " ".join(wordsegment.segment(img.ocr))
            img.ocr = check(ocr)
    for image in image_labels:
        image.save_to(output_dir)
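# Standalone illustration of the correction step above, assuming the
# `autocorrect` and `wordsegment` packages; the input string is made up.
from autocorrect import Speller
import wordsegment

wordsegment.load()
check = Speller(lang='en')
tokens = wordsegment.segment("thequickbrownfox")  # -> ['the', 'quick', 'brown', 'fox']
print(check(" ".join(tokens)))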
def __init__(self, editor):
    self.editor = editor
    self.signals = Signals()
    self.triggers = TriggerManager(editor)
    connect_all(self, self.signals, self.triggers,
                textbuffer=self.editor.textbuffer)
    self.block_word_reset = False
    self.words = None
    self.start_word = None
    self.start_offset = None
    self.indexer = IndexerProcessManager(self.signals.sender, editor)
    self.dictionary_manager = DictionaryManager(self.signals.sender, editor)
    self.communicator = ProcessCommunicator(self.signals.sender, editor)
    self.extractor = TextExtractor(self.signals.sender, editor)
    self.buffer_monitor = BufferMonitor(self.signals.sender, editor)
__author__ = 'liuqiang'

import chardet
import urllib2

from TextExtractor import TextExtractor
from Charsetdet import charsetdet

te = TextExtractor()
f = urllib2.urlopen('http://www.bbc.com/news/world-asia-china-33728654')
#f = urllib2.urlopen('http://www.sxdaily.com.cn/n/2015/0731/c142-5720819-4.html')
html = f.read()
code = charsetdet(html)
if code is None:
    # chardet.detect returns a dict such as {'encoding': ..., 'confidence': ...},
    # so only the encoding name is kept.
    code = chardet.detect(html)['encoding']
c = html.decode(code)
print te.extract(c)
import argparse

from TextExtractor import TextExtractor
from QuizGenerator import QuizGenerator

if __name__ == '__main__':
    # TODO: Clarify description.
    # Parse args.
    parser = argparse.ArgumentParser(
        description='Generate quiz from given pdf file.')
    parser.add_argument('file_path', type=str, help='file_path')
    # parser.add_argument('quiz_cnt', type=int, help='quiz_cnt')
    args = parser.parse_args()
    file_path = args.file_path
    # quiz_cnt = args.quiz_cnt

    # Extract text from pdf.
    extractor = TextExtractor(file_path)
    text = extractor.extract_text()

    # Generate quiz from text.
    quizgenerator = QuizGenerator(text)
    problem, answer = quizgenerator.generate_quiz()

    # Print `problem|answer` to stdout.
    res = problem + "|" + answer
    print(res)
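# Hedged sketch of a consumer parsing the `problem|answer` line printed
# above; the script and file names are hypothetical, and this assumes the
# problem text contains no '|' separator.
import subprocess

out = subprocess.check_output(['python', 'quiz_cli.py', 'slides.pdf'], text=True)
problem, answer = out.strip().split('|', 1)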
# -*- coding: utf-8 -*-
import os
import sys

from TextExtractor import TextExtractor, Template

if len(sys.argv) < 2:
    print('No URL')
    exit(1)

url = sys.argv[1]
try:
    _, _, site = TextExtractor.get_path_params(url)
    template = Template()
    file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates', site)
    print('Loading template from file', file)
    if not template.load(file):
        print('Template not loaded')
    extractor = TextExtractor()
    extractor.extract(url, template)
    count = extractor.format()
    extractor.save()
except Exception as err:
    print('Error', err.args)
else:
    print(count, 'line(s) saved to file', os.sep.join(TextExtractor.get_path_params(url)[0:2]))