Code Example #1
 def test_add_two_text_models(self):
     """Test that adding two text models works."""
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     text2 = Text(body="Too Nakma Noya Solo!", sender="you")
     text2.save()
     self.assertTrue(Text.objects.count() == 2)
Code Example #2
def handle_files(save=False):
    '''Create a Text object for every file returned by get_filenames and optionally save it.'''
    files = get_filenames()
    bar = pb.ProgressBar()
    bar(range(len(files)))
    for i, filename in enumerate(files):
        bar.update(i)
        language_folder = find_language_folder(filename)
        languages = language_dict[language_folder]
        multiple_languages = len(languages) > 1
        main_language = languages[0] if len(languages) == 1 else None
        filetype = filename.split('.')[-1]
        # skip anything that is not a known document format
        if filetype.lower() not in ['doc', 'docx', 'pdf', 'txt', 'rtf']:
            continue
        try:
            raw_text = textract.process(filename).decode()
            error = False
        except Exception:
            raw_text = ''
            error = True
        t = Text(filename=filename, filetype=filetype, raw_text=raw_text,
                 main_language=main_language, multiple_languages=multiple_languages,
                 source=source, text_type=texttype, error=error)
        if save:
            try:
                t.save()
            except Exception:
                print('could not save:', t)
                print(sys.exc_info())
                continue
            for language in languages:
                t.all_languages.add(language)
Code Example #3
File: tests.py Project: rwisecar/Pyphon
 def test_most_recent_text_body(self):
     """test that most_recent_text_body helper function returns the right text body."""
     contact = Contact(name="test", number='+12345678910')
     contact.save()
     text = Text(body="this is a test", sender="Them", contact=contact)
     text.save()
     self.assertEqual(contact.most_recent_text_body(), text.body)
Code Example #4
def load_cgn_in_database(cgn_transcriptions=None, save=False):
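    # Turn CGN transcription objects into Text rows (saved only when save=True);
    # assumes progressbar is imported as pb and that the 'cgn' Source and the
    # 'manual transcription' TextType already exist in the database.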
    if not cgn_transcriptions: cgn_transcriptions = make_cgn_transcriptions()
    cgn_source = Source.objects.get(name='cgn')
    text_type = TextType.objects.get(name='manual transcription')
    output = []
    bar = pb.ProgressBar()
    bar(range(len(cgn_transcriptions)))
    for i, t in enumerate(cgn_transcriptions):
        bar.update(i)
        error = t.get_bracket_error or t.bracket_error or t.tag_error
        o = Text(filetype='txt',
                 raw_text=t.text,
                 transcription_meta=t.line,
                 main_language=t.language,
                 source=cgn_source,
                 text_type=text_type,
                 start_time=t.start,
                 end_time=t.end,
                 wav_filename=t.wav,
                 multiple_languages=False,
                 error=error,
                 file_id=t.file_id,
                 speaker_id=t.line['speaker_id'],
                 speaker_gender=t.line['gender'])
        if save: o.save()
        output.append(o)
    return output
Code Example #5
def add_transcription(t, save=False, source=council_source, check_db=True):
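    # Build a Text for a single transcription; with check_db=True an existing row
    # with the same start/end time and wav filename is returned instead of a new one.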
    error = t.get_bracket_error or t.bracket_error or t.tag_error
    multiple_languages = len(t.languages) > 1
    if check_db:
        o = Text.objects.filter(start_time=t.start,
                                end_time=t.end,
                                wav_filename=t.wav)
        if o:
            print('transcription already stored, returning object from database')
            return o.first()
    o = Text(filetype='txt',
             raw_text=t.text,
             transcription_meta=t.line,
             main_language=t.language,
             source=source,
             text_type=text_type,
             start_time=t.start,
             end_time=t.end,
             wav_filename=t.wav,
             multiple_languages=multiple_languages,
             error=error,
             file_id=t.file_id)
    if not save: return o
    try:
        o.save()
    except Exception:
        print('could not save:', t)
        print(sys.exc_info())
    else:
        for language in t.languages:
            o.all_languages.add(language)
    return o
Code Example #6
def load_frisian_minutes_in_db(d=None, save=False):
    '''Load the set of PDFs scanned by Jelske into the database.'''
    source = Source.objects.get(name='frisian council minutes')
    texttype = TextType.objects.get(name='council notes')
    output = []
    language_dict = {'Frisian': frisian, 'Dutch': dutch}
    c = ld.load('Dutch-Frisian_sentences')
    if not d: d = make_text_frisian_minutes()
    for f, text in d.items():
        print(f)
        t = Text.objects.filter(filename__exact=f)
        if t:
            print(f.split('/')[-1], 'already found in database')
            output.append(t)
            continue
        o = c.predict_text(text)
        main_language = language_dict[o.main_language_overall_prediction]
        multiple_languages = True
        t = Text(filename=f, filetype='pdf', source=source, text_type=texttype,
                 raw_text=text, main_language=main_language,
                 multiple_languages=multiple_languages)
        output.append(t)
        if save:
            try:
                t.save()
            except Exception:
                print('could not save:', t)
                print(sys.exc_info())
                continue
            for language in [frisian, dutch]:
                t.all_languages.add(language)
    return output
Code Example #7
 def test_text_queryset_returns_text_body_on_page(self):
     """Test that a call to the text api contains actually body content."""
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     text2 = Text(body="I'm telling you, Jabba, I can get the money.")
     text2.save()
     text3 = Text(body="Solo! Solo! Too Nakma Noya Solo!", sender="them")
     text3.save()
     texts = self.client.get('/api/texts/')
     self.assertTrue("Jabba no watta" in texts.content.decode())
     self.assertTrue("I'm telling you, Jabba, I can get the money." in
                     texts.content.decode())
     self.assertTrue(
         "Solo! Solo! Too Nakma Noya Solo!" in texts.content.decode())
Code Example #8
 def test_text_queryset_returns_sender_attribute_in_json(self):
     """Test that a call to the text api contains the sender attribute."""
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     texts = self.client.get('/api/texts/')
     self.assertTrue("sender" in texts.content.decode())
     self.assertTrue("them" in texts.content.decode())
Code Example #9
File: tests.py Project: rwisecar/Pyphon
 def test_most_recent_text_body_longer_body(self):
     """Test helper function returns truncated text body."""
     contact = Contact(name="test", number='+12345678910')
     contact.save()
     text = Text(body="this is a sample text for testing",
                 sender="Them",
                 contact=contact)
     text.save()
     truncated_text = "this is a sample tex..."
     self.assertEqual(contact.most_recent_text_body(), truncated_text)
Code Example #10
 def test_last_text_view_returns_latest_incoming_text(self):
     """LastText should return latest incoming text."""
     view = LastText.as_view()
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     request = self.request.get('/sf')
     request.user = user1
     response = view(request)
     self.assertIn('Jabba no watta', response.rendered_content.decode())
     text2 = Text(body="this shouldn't show up.", sender="you")
     text2.save()
     response = view(request)
     self.assertIn('Jabba no watta', response.rendered_content.decode())
     text3 = Text(body="Not the same.", sender="them")
     text3.save()
     response = view(request)
     self.assertIn('Not the same', response.rendered_content.decode())
Code Example #11
 def test_text_queryset_is_all_texts(self):
     """Text view should show all texts."""
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     texts = self.client.get('/api/texts/')
     self.assertEqual(len(texts.json()), 0)
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     texts = self.client.get('/api/texts/')
     self.assertEqual(len(texts.json()), 1)
Code Example #12
 def test_text_view_template(self):
     """Test that text view uses texts template."""
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     text1 = Text(body="Jabba no watta.",
                  sender="them",
                  contact=self.contacts[0])
     text1.save()
     response = self.client.get(
         reverse_lazy('texts', kwargs={"pk": self.contacts[0].id}))
     self.assertTemplateUsed(response, 'texts/texting.html')
Code Example #13
 def test_text_view_status_200(self):
     """Test that text view returns ok status."""
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     text1 = Text(body="Jabba no watta.",
                  sender="them",
                  contact=self.contacts[0])
     text1.save()
     response = self.client.get(
         reverse_lazy('texts', kwargs={"pk": self.contacts[0].id}))
     self.assertTrue(response.status_code == 200)
Code Example #14
 def test_api_last_text_view_status_ok(self):
     """Test api contacts view is status ok."""
     user1 = User()
     user1.save()
     self.client.force_login(user1)
     request = self.request.get('/sf')
     request.user = user1
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     view = LastText.as_view()
     response = view(request)
     self.assertEqual(response.status_code, 200)
Code Example #15
File: views.py Project: rwisecar/Pyphon
 def post(self, request, *args, **kwargs):
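     # Incoming SMS webhook: the form payload carries Twilio-style "From" and
     # "Body" fields; look up or create the Contact, then store the message as a Text.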
     parser = FormParser()
     query_dict = parser.parse(request)
     contact = Contact.objects.filter(number=query_dict["From"]).first()
     if not contact:
         contact = Contact(number=query_dict["From"])
         contact.save()
     if contact.number != os.environ["TWILIO_NUMBER"]:
         sender = "them"
     else:
         sender = "you"
     text = Text(sender=sender, contact=contact, body=query_dict["Body"])
     text.save()
     return HttpResponse()
Code Example #16
def handle_text_xml(t):
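    # Store one parsed wiki XML entry as a Frisian Text unless an entry with the
    # same file_id and title is already in the database (clean, frisian, source
    # and text_type are module-level names).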
    clean_text = clean(t.text)
    title = text_xml2title(t)
    file_id = text_xml2id(t)
    t = Text.objects.filter(file_id=file_id, title=title)
    if t: print('already found wiki entry in database')
    else:
        t = Text(clean_text=clean_text,
                 title=title,
                 file_id=file_id,
                 main_language=frisian,
                 source=source,
                 text_type=text_type)
        t.save()
        t.all_languages.add(frisian)
    return t
Code Example #17
    def done(self, form_list, **kwargs):

        # Handle upload from local file.
        if form_list[0].cleaned_data['method'] == 'local':
            content = form_list[1].cleaned_data['upload'].read()
            filename = form_list[1].cleaned_data['upload'].name
            length = len(content)
            title = form_list[1].cleaned_data['title']
            uri = form_list[1].cleaned_data['uri']
            dateCreated = form_list[1].cleaned_data['dateCreated']
            dateDigitized = form_list[1].cleaned_data['dateDigitized']
            creator = form_list[1].cleaned_data['creator']
            text = Text(uri=uri,
                        title=title,
                        dateCreated=dateCreated,
                        dateDigitized=dateDigitized,
                        content=content,
                        filename=filename,
                        length=length)
            text.save()

            for c in creator:
                text.creator.add(c.id)
            text.save()

        # Handle selection of remote files.
        elif form_list[0].cleaned_data['method'] == 'remote':
            repo = form_list[1].cleaned_data['repository']
            coll = form_list[2].cleaned_data['collection']
            text = []
            for item in list_items(repo, coll):
                t = handle_item(repo, item)
                if t is not None:
                    text.append(t)

        return render_to_response(
            'texts/done.html', {
                'form_data': [form.cleaned_data for form in form_list],
                'text': text
            })
Code Example #18
def handle_item(repo, item):
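    # Fetch the item's primary bitstream from the repository and wrap it in a new
    # Text, unless a Text with the same uri has already been stored.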
    cred = repo.credential
    manager = RepositoryManager(cred)
    
    # Ignore items without bitstreams.
    if item['primary_bitstream'] in [ None, '-1' ]:
        return None
    
    try:
        text = Text.objects.get(uri=item['uri'])
        exists = True
    except Text.DoesNotExist:
        exists = False
    
    if not exists:
        # Get bitstream.
        bitstream = manager.get_bitstream(item['primary_bitstream'])
        
        # Get Creators.
        creators = []
        for creator in item['creators']:
            creators.append(retrieve_concept(creator))


        text = Text(uri=item['uri'],
                    title=item['title'],
                    dateCreated=handle_date(item['dateCreated']),
                    dateDigitized=handle_date(item['dateDigitized']),
                    content=bitstream,
                    filename=item['uri'],
                    length=len(bitstream))
        text.save()
        for creator in creators:
            text.creator.add(creator)
        text.save()
        
        return text
    return None
Code Example #19
    def handle(self, *args, **options):
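        # Each subdirectory of source_dir becomes a source; the directory named by
        # the base_texts option seeds the working witnesses, and every other
        # witness is diffed against its base text with dwdiff to produce annotations.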
        source_dir = options['source_dir']
        dir_list = next(os.walk(source_dir))[1]
        base = options['base_texts']
        texts = {}
        base_texts = {} #filepaths to base texts
        working_witnesses = {} # working-source copies of each base text
        base_witnesses = {} # witnesses that the base was copied from
        sources = {}

        for source in Source.objects.all():
            sources[source.name] = source

        # create (or reuse) the working source
        working_source, _ = Source.objects.get_or_create(
            name=WORKING_SOURCE_NAME,
            is_working=True,
        )

        # make sure base text is the first witness processed
        sorted_dir_list = []
        for dir in dir_list:
            if dir == base:
                sorted_dir_list.insert(0, dir)
            else:
                sorted_dir_list.append(dir)

        for dir in sorted_dir_list:
            full_dir = os.path.join(source_dir, dir)
            if dir == base:
                is_base = True
            else:
                is_base = False

            if dir not in sources:
                source = Source.objects.create(
                    name=dir,
                    is_base=is_base
                )
                sources[dir] = source
            else:
                source = sources[dir]

            files = next(os.walk(full_dir))[2]

            for filename in files:
                if filename[0] == '.':
                    continue
                filepath = os.path.join(full_dir, filename)

                if 'layout' in filename:
                    continue
                else:
                    text_name = os.path.splitext(filename)[0]
                    if text_name not in texts:
                        text = Text()
                        text.name = text_name
                        text.save()
                        texts[text_name] = text
                    else:
                        text = texts[text_name]

                    witness = Witness()
                    witness.text = text
                    witness.source = source
                    witness.save()

                    if is_base:
                        working_witness = Witness()
                        working_witness.text = text
                        working_witness.source = working_source
                        with open(filepath, 'r') as file:
                            content = file.read()
                            working_witness.content = content
                        working_witness.save()

                        base_texts[text_name] = filepath
                        # base_path = filepath
                        working_witnesses[text_name] = working_witness
                        base_witnesses[text_name] = witness

                        # there won't be any annotations for the base witness clone
                        # or the base witness itself
                        continue
                    else:
                        base_path = base_texts[text_name]

                    # if is_base:
                    #     base_witnesses[text_name] = base_witness
                    #     continue

                    working_witness = working_witnesses[text_name]

                    command_args = f'--start-delete="|-" --stop-delete="-/" --aggregate-changes -d "ཿ།།༌་ \n" "{base_path}" "{filepath}"'
                    command = f"dwdiff {command_args}"

                    try:
                        diff = subprocess.run(shlex.split(command), stdout=subprocess.PIPE, encoding='utf-8').stdout
                    except Exception as e:
                        diff = ''
                        print(e)

                    try:
                        annotations = parse_word_diff(diff)
                    except Exception as e:
                        annotations = []
                        print(f'could not parse diff for dir: {dir}, filename: {filename}: {e}')

                    for annotation_data in annotations:
                        annotation = Annotation()
                        annotation.witness = working_witness
                        annotation.start = annotation_data['start']
                        annotation.length = annotation_data['length']
                        annotation.content = annotation_data['replacement']
                        annotation.creator_witness = witness
                        annotation.save()

            for filename in files:
                filepath = os.path.join(full_dir, filename)

                if 'layout' not in filename:
                    continue

                text_name = os.path.splitext(filename)[0].replace('_layout', '')
                # for now, assume page breaks are only for the base witness
                base_origin_witness = base_witnesses[text_name]
                working_witness = working_witnesses[text_name]
                with open(filepath, 'r') as file:
                    content = file.read()

                pb_count = 0
                page_breaks = parse_layout_data(content)
                for page_break in page_breaks:
                    pb_count += 1
                    annotation = Annotation()
                    annotation.witness = working_witness
                    annotation.start = page_break
                    annotation.length = 0
                    annotation.content = ""
                    annotation.creator_witness = base_origin_witness
                    annotation.type = AnnotationType.page_break.value
                    annotation.save()
Code Example #20
File: ingest.py Project: adampmoore/cts
def fetch_texts(ingest_id):
    from texts.models import Corpus, Text
    from annis.models import AnnisServer
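    # Re-ingest every document of the corpora attached to this ingest from the
    # ANNIS server, replacing any previously stored Text with the same title.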

    # Pick the ANNIS server to query.
    annis_server = AnnisServer.objects.all()[:1]

    if annis_server:
        annis_server = annis_server[0]
        if not annis_server.base_domain.endswith("/"):
            annis_server.base_domain += "/"
    else:
        logger.error("No ANNIS server found")
        return False

    ingest = _retry_getting_ingest(ingest_id)
    if not ingest:
        logger.error('Ingest with ID %d not found in database' % ingest_id)
        return

    logger.info("Starting virtual framebuffer")
    vdisplay = Xvfb()
    try:
        vdisplay.start()
    except Exception as e:
        logger.error('Unable to start Xvfb: %s' % e)

    ingesting_corpora = Corpus.objects.filter(
        id__in=(ingest.corpora.values_list('id', flat=True)))

    try:
        for corpus in ingesting_corpora:
            corpus_name = corpus.annis_corpus_name
            logger.info('Importing corpus ' + corpus.title)
            doc_names_url = annis_server.url_corpus_docname(corpus_name)
            doc_titles = [
                fields[0] for fields in get_selected_annotation_fields(
                    doc_names_url, ('name', ))
            ]
            logger.info('%d documents found for corpus %s: %s' %
                        (len(doc_titles), corpus_name, ', '.join(doc_titles)))

            for title in doc_titles:
                logger.info('Importing ' + title)

                Text.objects.filter(title=title).delete()

                text = Text()
                text.title = title
                text.slug = str(slugify(title))
                text.corpus = corpus
                text.ingest = ingest
                text.save()

                doc_meta_url = annis_server.url_document_metadata(
                    corpus_name, text.title)
                metadata.collect_text_meta(doc_meta_url, text)
                vis.collect(corpus, text, annis_server)

                ingest.num_texts_ingested += 1
                ingest.save()

            ingest.num_corpora_ingested += 1
            ingest.save()
    except VisServerRefusingConn:
        logger.error(
            'Aborting ingestion because visualization server repeatedly refused connections'
        )

    vdisplay.stop()

    logger.info('Finished')
Code Example #21
    def handle(self, *args, **options):
        csv_filepath = options['csv_file']
        topics = {}
        texts = {}
        authors = {}

        stored_texts = Text.objects.all()
        for text in stored_texts:
            texts[text.code] = text
        
        stored_topics = Topic.objects.all()
        for topic in stored_topics:
            topics[topic.name] = topic

        stored_authors = Author.objects.all()
        for author in stored_authors:
            authors[author.name] = author

        with open(csv_filepath) as csv_file:
            csv_reader = csv.reader(csv_file)
            for row in csv_reader:
                text_code = row[self.TEXT_CODE].strip().lower()
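                # rows whose code is not a letter followed by digits are header or
                # note rows and are skipped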
                if not re.match(r'^[a-z][\d]+$', text_code):
                    continue
                title = row[self.TITLE]
                topic_name = row[self.TOPIC]
                author_name = row[self.AUTHOR]

                if text_code not in texts:
                    text = Text()
                    text.name = title
                    text.code = text_code
                    text.save()
                    texts[text_code] = text
                else:
                    text = texts[text_code]
                
                if topic_name not in topics:
                    topic = Topic()
                    topic.name = topic_name
                    topic.save()
                    topics[topic_name] = topic
                else:
                    topic = topics[topic_name]

                if not text.topics.filter(id=topic.id).exists():
                    text.topics.add(topic)
                    text.save()

                if author_name not in authors:
                    author = Author()
                    author.name = author_name
                    author.save()
                    authors[author_name] = author
                else:
                    author = authors[author_name]

                if not text.author or text.author != author:
                    text.author = author
                    text.save()
                else:
                    print(f"author already saved: {author_name}, {author.id}, {text.author.id}")
Code Example #22
 def test_add_text_model(self):
     """Test that adding a text model works."""
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     self.assertTrue(Text.objects.count() == 1)
Code Example #23
 def test_text_body(self):
     """Test that text instance has correct body."""
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     self.assertTrue(Text.objects.first().body == "Jabba no watta.")
Code Example #24
 def test_text_time_format(self):
     """Test that text instance has correct time format."""
     text1 = Text(body="Jabba no watta.", sender="them")
     text1.save()
     self.assertTrue(
         isinstance(Text.objects.first().time, datetime.datetime))
Code Example #25
 def test_text_repr_is_body(self):
     """Test that texts are properly represented."""
     text1 = Text(body="No bata tu tu, muni, muni.", sender="them")
     text1.save()
     self.assertTrue(text1.__str__() == text1.body[:20])