def test_add_two_text_models(self):
    """Saving two Text instances should yield a count of two."""
    for body, sender in (("Jabba no watta.", "them"),
                         ("Too Nakma Noya Solo!", "you")):
        message = Text(body=body, sender=sender)
        message.save()
    self.assertTrue(Text.objects.count() == 2)
def handle_files(save=False):
    """Extract raw text from every file returned by get_filenames() and wrap
    each one in a Text model instance tagged with language metadata derived
    from its folder.

    Files whose extension is not doc/docx/pdf/txt/rtf are skipped. Extraction
    failures are recorded on the Text row via ``error=True`` rather than
    aborting the run.

    :param save: when True, persist each Text and link its languages.
    """
    # Hoisted: the original rebuilt this list from a string on every file.
    allowed_filetypes = {'doc', 'docx', 'pdf', 'txt', 'rtf'}
    files = get_filenames()
    bar = pb.ProgressBar()
    bar(range(len(files)))
    for i, filename in enumerate(files):
        bar.update(i)
        language_folder = find_language_folder(filename)
        languages = language_dict[language_folder]
        multiple_languages = len(languages) > 1  # was a redundant ternary
        main_language = languages[0] if len(languages) == 1 else None
        filetype = filename.split('.')[-1]
        if filetype.lower() not in allowed_filetypes:
            continue
        try:
            raw_text = textract.process(filename).decode()
            error = False
        except Exception:  # narrowed from bare except: keep Ctrl-C working
            raw_text = ''
            error = True
        t = Text(filename=filename, filetype=filetype, raw_text=raw_text,
                 main_language=main_language,
                 multiple_languages=multiple_languages, source=source,
                 text_type=texttype, error=error)
        if save:
            try:
                t.save()
            except Exception:  # narrowed from bare except; keep best-effort
                print('could not save:', t)
                print(sys.exc_info())
                continue
            for language in languages:
                t.all_languages.add(language)
def test_most_recent_text_body(self):
    """most_recent_text_body should return the body of the contact's text."""
    owner = Contact(name="test", number='+12345678910')
    owner.save()
    message = Text(body="this is a test", sender="Them", contact=owner)
    message.save()
    self.assertEqual(owner.most_recent_text_body(), message.body)
def load_cgn_in_database(cgn_transcriptions=None, save=False):
    """Convert CGN transcriptions into Text rows and return them.

    :param cgn_transcriptions: transcription objects; fetched via
        make_cgn_transcriptions() when falsy.
    :param save: when True, persist each Text as it is created.
    :return: list of Text instances (saved or unsaved).
    """
    if not cgn_transcriptions:
        cgn_transcriptions = make_cgn_transcriptions()
    cgn_source = Source.objects.get(name='cgn')
    text_type = TextType.objects.get(name='manual transcription')
    output = []
    bar = pb.ProgressBar()
    bar(range(len(cgn_transcriptions)))
    for index, transcription in enumerate(cgn_transcriptions):
        bar.update(index)
        has_error = (transcription.get_bracket_error
                     or transcription.bracket_error
                     or transcription.tag_error)
        record = Text(
            filetype='txt',
            raw_text=transcription.text,
            transcription_meta=transcription.line,
            main_language=transcription.language,
            source=cgn_source,
            text_type=text_type,
            start_time=transcription.start,
            end_time=transcription.end,
            wav_filename=transcription.wav,
            multiple_languages=False,
            error=has_error,
            file_id=transcription.file_id,
            speaker_id=transcription.line['speaker_id'],
            speaker_gender=transcription.line['gender'],
        )
        if save:
            record.save()
        output.append(record)
    return output
def add_transcription(t, save=False, source=council_source, check_db=True):
    """Create (and optionally save) a Text row for transcription *t*.

    :param t: transcription object carrying text, timing and language info.
    :param save: persist the Text and link its languages when True.
    :param source: Source row; defaults to the module-level council_source.
    :param check_db: when True, short-circuit if a Text with the same
        start/end/wav is already stored.
    :return: the new Text instance, or — NOTE — the matching queryset when a
        duplicate is found (pre-existing contract, kept for callers).
    """
    error = t.get_bracket_error or t.bracket_error or t.tag_error
    multiple_languages = len(t.languages) > 1  # was a redundant ternary
    if check_db:
        existing = Text.objects.filter(start_time=t.start, end_time=t.end,
                                       wav_filename=t.wav)
        if existing:
            print('transcription already stored, returning object from database')
            return existing
    o = Text(filetype='txt', raw_text=t.text, transcription_meta=t.line,
             main_language=t.language, source=source, text_type=text_type,
             start_time=t.start, end_time=t.end, wav_filename=t.wav,
             multiple_languages=multiple_languages, error=error,
             file_id=t.file_id)
    if not save:
        return o
    try:
        o.save()
    except Exception:  # narrowed from bare except; keep best-effort behavior
        print('could not save:', t)
        print(sys.exc_info())
    else:
        for language in t.languages:
            o.all_languages.add(language)
    return o
def load_frisian_minutes_in_db(d=None, save=False):
    '''set of pdf's scanned by jelske.'''
    source = Source.objects.get(name='frisian council minutes')
    texttype = TextType.objects.get(name='council notes')
    output = []
    language_dict = {'Frisian': frisian, 'Dutch': dutch}
    c = ld.load('Dutch-Frisian_sentences')
    if not d:
        d = make_text_frisian_minutes()
    for f, text in d.items():
        print(f)
        t = Text.objects.filter(filename__exact=f)
        if t:
            # debug leftover fixed: printed the literal 9 with the match
            # commented out — restore printing the queryset itself.
            print(f.split('/')[-1], 'already found in database', t)
            # NOTE: appends the queryset, not a Text instance — pre-existing
            # contract, kept for callers.
            output.append(t)
            continue
        o = c.predict_text(text)
        main_language = language_dict[o.main_language_overall_prediction]
        multiple_languages = True
        t = Text(filename=f, filetype='pdf', source=source,
                 text_type=texttype, raw_text=text,
                 main_language=main_language,
                 multiple_languages=multiple_languages)
        output.append(t)
        if save:
            try:
                t.save()
            except Exception:  # narrowed from bare except; keep best-effort
                # debug leftover fixed: printed the literal 10 instead of t
                print('could not save:', t)
                print(sys.exc_info())
                continue
            for language in [frisian, dutch]:
                t.all_languages.add(language)
    return output
def test_text_queryset_returns_text_body_on_page(self):
    """The texts API response should contain every saved text body."""
    account = User()
    account.save()
    self.client.force_login(account)
    Text(body="Jabba no watta.", sender="them").save()
    Text(body="I'm telling you, Jabba, I can get the money.").save()
    Text(body="Solo! Solo! Too Nakma Noya Solo!", sender="them").save()
    response = self.client.get('/api/texts/')
    page = response.content.decode()
    self.assertTrue("Jabba no watta" in page)
    self.assertTrue("I'm telling you, Jabba, I can get the money." in page)
    self.assertTrue("Solo! Solo! Too Nakma Noya Solo!" in page)
def test_text_queryset_returns_sender_attribute_in_json(self):
    """The texts API payload should expose the sender field and its value."""
    account = User()
    account.save()
    self.client.force_login(account)
    message = Text(body="Jabba no watta.", sender="them")
    message.save()
    payload = self.client.get('/api/texts/').content.decode()
    self.assertTrue("sender" in payload)
    self.assertTrue("them" in payload)
def test_most_recent_text_body_longer_body(self):
    """Long bodies should come back truncated with a trailing ellipsis."""
    owner = Contact(name="test", number='+12345678910')
    owner.save()
    message = Text(body="this is a sample text for testing",
                   sender="Them", contact=owner)
    message.save()
    expected = "this is a sample tex..."
    self.assertEqual(owner.most_recent_text_body(), expected)
def test_last_text_view_returns_latest_incoming_text(self):
    """LastText should track the newest text whose sender is "them"."""
    view = LastText.as_view()
    account = User()
    account.save()
    self.client.force_login(account)
    Text(body="Jabba no watta.", sender="them").save()
    request = self.request.get('/sf')
    request.user = account
    rendered = view(request).rendered_content.decode()
    self.assertIn('Jabba no watta', rendered)
    # An outgoing text must not displace the latest incoming one.
    Text(body="this shouldn't show up.", sender="you").save()
    rendered = view(request).rendered_content.decode()
    self.assertIn('Jabba no watta', rendered)
    # A newer incoming text becomes the one displayed.
    Text(body="Not the same.", sender="them").save()
    rendered = view(request).rendered_content.decode()
    self.assertIn('Not the same', rendered)
def test_text_queryset_is_all_texts(self):
    """The texts API should list exactly the texts in the database."""
    account = User()
    account.save()
    self.client.force_login(account)
    response = self.client.get('/api/texts/')
    self.assertEqual(len(response.json()), 0)
    Text(body="Jabba no watta.", sender="them").save()
    response = self.client.get('/api/texts/')
    self.assertEqual(len(response.json()), 1)
def test_text_view_template(self):
    """The texts view should render the texting template."""
    account = User()
    account.save()
    self.client.force_login(account)
    message = Text(body="Jabba no watta.", sender="them",
                   contact=self.contacts[0])
    message.save()
    url = reverse_lazy('texts', kwargs={"pk": self.contacts[0].id})
    response = self.client.get(url)
    self.assertTemplateUsed(response, 'texts/texting.html')
def test_text_view_status_200(self):
    """The texts view should respond with HTTP 200."""
    account = User()
    account.save()
    self.client.force_login(account)
    message = Text(body="Jabba no watta.", sender="them",
                   contact=self.contacts[0])
    message.save()
    url = reverse_lazy('texts', kwargs={"pk": self.contacts[0].id})
    response = self.client.get(url)
    self.assertTrue(response.status_code == 200)
def test_api_last_text_view_status_ok(self):
    """The LastText view should respond with HTTP 200."""
    account = User()
    account.save()
    self.client.force_login(account)
    request = self.request.get('/sf')
    request.user = account
    Text(body="Jabba no watta.", sender="them").save()
    response = LastText.as_view()(request)
    self.assertEqual(response.status_code, 200)
def post(self, request, *args, **kwargs):
    """Twilio webhook: record an incoming SMS as a Text.

    Signature fixed to ``*args, **kwargs``: Django's View.dispatch forwards
    URLconf captures as keyword arguments, so the old ``*kwargs`` (a
    positional catch-all) raised TypeError on any keyword argument.
    """
    parser = FormParser()
    query_dict = parser.parse(request)
    contact = Contact.objects.filter(number=query_dict["From"]).first()
    if not contact:
        contact = Contact(number=query_dict["From"])
        contact.save()
    # Texts originating from our own Twilio number are tagged "you".
    if contact.number != os.environ["TWILIO_NUMBER"]:
        sender = "them"
    else:
        sender = "you"
    text = Text(sender=sender, contact=contact, body=query_dict["Body"])
    text.save()
    return HttpResponse()
def handle_text_xml(t):
    """Persist a wiki XML entry as a Text, skipping known entries.

    NOTE: returns a queryset when the entry already exists, a saved Text
    instance otherwise.
    """
    clean_text = clean(t.text)
    title = text_xml2title(t)
    file_id = text_xml2id(t)
    t = Text.objects.filter(file_id=file_id, title=title)
    if not t:
        t = Text(clean_text=clean_text, title=title, file_id=file_id,
                 main_language=frisian, source=source, text_type=text_type)
        t.save()
        t.all_languages.add(frisian)
    else:
        print('already found wiki entry in database')
    return t
def done(self, form_list, **kwargs):
    """Finish the upload wizard: persist a local upload or ingest remote items.

    ``form_list[0]`` selects the method ('local' or 'remote'); the later
    forms carry either the uploaded file plus its metadata, or the chosen
    repository and collection.

    Fixes: removed the unused ``from pprint import pprint`` in the remote
    branch and de-duplicated the repeated upload-field lookups.
    """
    # Handle upload from local file.
    if form_list[0].cleaned_data['method'] == 'local':
        upload = form_list[1].cleaned_data['upload']
        content = upload.read()
        filename = upload.name
        length = len(content)
        title = form_list[1].cleaned_data['title']
        uri = form_list[1].cleaned_data['uri']
        dateCreated = form_list[1].cleaned_data['dateCreated']
        dateDigitized = form_list[1].cleaned_data['dateDigitized']
        creator = form_list[1].cleaned_data['creator']
        text = Text(uri=uri, title=title, dateCreated=dateCreated,
                    dateDigitized=dateDigitized, content=content,
                    filename=filename, length=length)
        text.save()  # needs a pk before m2m creators can be attached
        for c in creator:
            text.creator.add(c.id)
        text.save()
    # Handle selection of remote files.
    elif form_list[0].cleaned_data['method'] == 'remote':
        repo = form_list[1].cleaned_data['repository']
        coll = form_list[2].cleaned_data['collection']
        text = []
        for item in list_items(repo, coll):
            t = handle_item(repo, item)
            if t is not None:
                text.append(t)
    return render_to_response(
        'texts/done.html',
        {
            'form_data': [form.cleaned_data for form in form_list],
            'text': text,
        })
def handle_item(repo, item):
    """Create and return a Text for a repository item.

    Returns None when the item has no primary bitstream or when a Text with
    the same uri already exists.
    """
    cred = repo.credential
    manager = RepositoryManager(cred)
    # Ignore items without bitstreams.
    if item['primary_bitstream'] in [None, '-1']:
        return None
    try:
        Text.objects.get(uri=item['uri'])
        return None  # already ingested
    except Text.DoesNotExist:
        pass
    # Get bitstream.
    bitstream = manager.get_bitstream(item['primary_bitstream'])
    # Get Creators.
    creators = [retrieve_concept(creator) for creator in item['creators']]
    text = Text(
        uri=item['uri'],
        title=item['title'],
        dateCreated=handle_date(item['dateCreated']),
        dateDigitized=handle_date(item['dateDigitized']),
        content=bitstream,
        filename=item['uri'],
        length=len(bitstream),
    )
    text.save()  # needs a pk before m2m creators can be attached
    for creator in creators:
        text.creator.add(creator)
    text.save()
    return text
def handle(self, *args, **options):
    """Build working witnesses by diffing each source's files against the
    base text with dwdiff, storing differences and page breaks as
    Annotations.

    Expects ``source_dir/<source_name>/<text_name>[.ext]`` plus optional
    ``*layout*`` files carrying page-break data for the base witness.
    The base directory (``--base_texts``) is processed first so every other
    witness has a base to diff against.
    """
    source_dir = options['source_dir']
    dir_list = next(os.walk(source_dir))[1]
    base = options['base_texts']
    texts = {}
    base_texts = {}          # filepaths to base texts
    working_witnesses = {}   # witnesses that are classes as a base text
    base_witnesses = {}      # witnesses that the base was copied from
    sources = {}
    for source in Source.objects.all():
        sources[source.name] = source
    # create base source and witness
    working_source, _ = Source.objects.get_or_create(
        name=WORKING_SOURCE_NAME,
        is_working=True,
    )
    # make sure base text is the first witness processed
    sorted_dir_list = []
    for dir in dir_list:
        if dir == base:
            sorted_dir_list.insert(0, dir)
        else:
            sorted_dir_list.append(dir)
    for dir in sorted_dir_list:
        full_dir = os.path.join(source_dir, dir)
        is_base = dir == base
        if dir not in sources:
            source = Source.objects.create(name=dir, is_base=is_base)
            sources[dir] = source
        else:
            source = sources[dir]
        files = next(os.walk(full_dir))[2]
        for filename in files:
            if filename[0] == '.':
                continue  # skip hidden files
            filepath = os.path.join(full_dir, filename)
            if 'layout' in filename:
                continue  # layout files handled in the second pass below
            text_name = os.path.splitext(filename)[0]
            if text_name not in texts:
                text = Text()
                text.name = text_name
                text.save()
                texts[text_name] = text
            else:
                text = texts[text_name]
            witness = Witness()
            witness.text = text
            witness.source = source
            witness.save()
            if is_base:
                # Clone the base file as the editable "working" witness.
                working_witness = Witness()
                working_witness.text = text
                working_witness.source = working_source
                with open(filepath, 'r') as file:
                    content = file.read()
                    working_witness.content = content
                working_witness.save()
                base_texts[text_name] = filepath
                working_witnesses[text_name] = working_witness
                base_witnesses[text_name] = witness
                # there won't be any annotations for the base witness clone
                # or the base witness itself
                continue
            base_path = base_texts[text_name]
            working_witness = working_witnesses[text_name]
            command_args = f'--start-delete="|-" --stop-delete="-/" --aggregate-changes -d "ཿ།།༌་ \n" "{base_path}" "{filepath}"'
            command = f"dwdiff {command_args}"
            # BUG FIX: diff could be unset (or stale from the previous file)
            # when subprocess.run raised; initialize it per file.
            diff = ''
            try:
                diff = subprocess.run(shlex.split(command),
                                      stdout=subprocess.PIPE,
                                      encoding='utf-8').stdout
            except Exception as e:
                print(e)
            try:
                annotations = parse_word_diff(diff)
            except Exception as e:
                annotations = []
                print(f'dir: {dir}, filename: (unknown)')
            for annotation_data in annotations:
                annotation = Annotation()
                annotation.witness = working_witness
                annotation.start = annotation_data['start']
                annotation.length = annotation_data['length']
                annotation.content = annotation_data['replacement']
                annotation.creator_witness = witness
                annotation.save()
        # Second pass: layout files define page breaks.
        for filename in files:
            filepath = os.path.join(full_dir, filename)
            if 'layout' not in filename:
                continue
            text_name = os.path.splitext(filename)[0].replace('_layout', '')
            # for now, assume page breaks are only for the base witness
            base_origin_witness = base_witnesses[text_name]
            working_witness = working_witnesses[text_name]
            with open(filepath, 'r') as file:
                content = file.read()
            pb_count = 0
            page_breaks = parse_layout_data(content)
            for page_break in page_breaks:
                pb_count += 1
                annotation = Annotation()
                annotation.witness = working_witness
                annotation.start = page_break
                annotation.length = 0
                annotation.content = ""
                annotation.creator_witness = base_origin_witness
                annotation.type = AnnotationType.page_break.value
                annotation.save()
def fetch_texts(ingest_id):
    """Ingest texts for every corpus attached to the given ingest record.

    Queries the configured ANNIS server for each corpus' documents, replaces
    any previously stored Text with the same title, and collects metadata and
    visualizations. Runs inside a virtual framebuffer (Xvfb) needed by the
    visualization step.

    Fix: ``vdisplay.stop()`` now runs in a ``finally`` block, so the Xvfb
    process no longer leaks when an exception other than
    VisServerRefusingConn escapes the ingestion loop.
    """
    from texts.models import Corpus, Text
    from annis.models import AnnisServer
    # Define HTML Formats and the ANNIS server to query
    annis_server = AnnisServer.objects.all()[:1]
    if annis_server:
        annis_server = annis_server[0]
        if not annis_server.base_domain.endswith("/"):
            annis_server.base_domain += "/"
    else:
        logger.error("No ANNIS server found")
        return False
    ingest = _retry_getting_ingest(ingest_id)
    if not ingest:
        logger.error('Ingest with ID %d not found in database' % ingest_id)
        return
    logger.info("Starting virtual framebuffer")
    vdisplay = Xvfb()
    try:
        vdisplay.start()
    except Exception as e:
        logger.error('Unable to start Xvfb: %s' % e)
    ingesting_corpora = Corpus.objects.filter(
        id__in=(ingest.corpora.values_list('id', flat=True)))
    try:
        for corpus in ingesting_corpora:
            corpus_name = corpus.annis_corpus_name
            logger.info('Importing corpus ' + corpus.title)
            doc_names_url = annis_server.url_corpus_docname(corpus_name)
            doc_titles = [
                fields[0] for fields in get_selected_annotation_fields(
                    doc_names_url, ('name', ))
            ]
            logger.info('%d documents found for corpus %s: %s' %
                        (len(doc_titles), corpus_name, ', '.join(doc_titles)))
            for title in doc_titles:
                logger.info('Importing ' + title)
                # Replace any previously ingested copy of this document.
                Text.objects.filter(title=title).delete()
                text = Text()
                text.title = title
                text.slug = slugify(title).__str__()
                text.corpus = corpus
                text.ingest = ingest
                text.save()
                doc_meta_url = annis_server.url_document_metadata(
                    corpus_name, text.title)
                metadata.collect_text_meta(doc_meta_url, text)
                vis.collect(corpus, text, annis_server)
                ingest.num_texts_ingested += 1
                ingest.save()
            ingest.num_corpora_ingested += 1
            ingest.save()
    except VisServerRefusingConn:
        logger.error(
            'Aborting ingestion because visualization server repeatedly refused connections'
        )
    finally:
        vdisplay.stop()
    logger.info('Finished')
def handle(self, *args, **options):
    """Load texts, topics and authors from a CSV file into the database."""
    csv_filepath = options['csv_file']
    # Cache existing rows so each code/name is looked up at most once.
    texts = {record.code: record for record in Text.objects.all()}
    topics = {record.name: record for record in Topic.objects.all()}
    authors = {record.name: record for record in Author.objects.all()}
    with open(csv_filepath) as csv_file:
        for row in csv.reader(csv_file):
            text_code = row[self.TEXT_CODE].strip().lower()
            # Only rows whose code matches e.g. "a12" are data rows.
            if not re.match(r'^[a-z][\d]+$', text_code):
                continue
            title = row[self.TITLE]
            topic_name = row[self.TOPIC]
            author_name = row[self.AUTHOR]
            text = texts.get(text_code)
            if text is None:
                text = Text()
                text.name = title
                text.code = text_code
                text.save()
                texts[text_code] = text
            topic = topics.get(topic_name)
            if topic is None:
                topic = Topic()
                topic.name = topic_name
                topic.save()
                topics[topic_name] = topic
            if not text.topics.filter(id=topic.id).exists():
                text.topics.add(topic)
                text.save()
            author = authors.get(author_name)
            if author is None:
                author = Author()
                author.name = author_name
                author.save()
                authors[author_name] = author
            if not text.author or text.author != author:
                text.author = author
                text.save()
            else:
                print(f"author already saved: {author_name}, {author.id}, {text.author.id}")
def test_add_text_model(self):
    """Saving a single Text should yield a count of one."""
    message = Text(body="Jabba no watta.", sender="them")
    message.save()
    self.assertTrue(Text.objects.count() == 1)
def test_image_body(self):
    """A saved text should round-trip its body through the database."""
    message = Text(body="Jabba no watta.", sender="them")
    message.save()
    stored = Text.objects.first()
    self.assertTrue(stored.body == "Jabba no watta.")
def test_image_time_format(self):
    """A saved text's time field should be a datetime instance."""
    message = Text(body="Jabba no watta.", sender="them")
    message.save()
    stored = Text.objects.first()
    self.assertTrue(isinstance(stored.time, datetime.datetime))
def test_text_repr_is_body(self):
    """str(text) should equal the first 20 characters of its body."""
    message = Text(body="No bata tu tu, muni, muni.", sender="them")
    message.save()
    self.assertTrue(message.__str__() == message.body[:20])