def add_collection(self):
    """Create a two-document test collection, store it and return it.

    The collection address and both document hashes are freshly generated
    ``uuid.uuid1()`` strings.  The collection is handed to
    ``self.our_cache.insert_new_collection``, its hash is refreshed through
    ``collections.update_hash`` and the cache session is committed.

    :return: the newly created ``Collection``
    """
    cache = self.our_cache
    address = str(uuid.uuid1())
    first_hash = str(uuid.uuid1())
    second_hash = str(uuid.uuid1())

    test_documents = [
        Document(description="Test document A", hash=first_hash, title="Test A"),
        Document(description="Test document B", hash=second_hash, title="Test B"),
    ]
    new_collection = Collection(
        title="Test",
        description="This is a collection!",
        address=address,
        btc="123456789",
        keywords=[],
        documents=test_documents,
        creation_date=datetime.datetime.now(),
        oldest_date=datetime.datetime.now(),
        latest_broadcast_date=datetime.datetime.now(),
    )

    cache.insert_new_collection(new_collection)
    collections.update_hash(new_collection)
    cache.session.commit()
    return new_collection
Example #2
0
class TextEditorController(object):
    """Controller that opens and saves the text of a single ``Document``."""

    def __init__(self):
        # One document instance is reused for every open/save call.
        self.document = Document()

    def open(self, path):
        """Load *path* into the controller's document.

        :param path: location handed to the document's ``open`` method
        :return: ``(success, document)``; ``success`` is ``False`` when
                 opening raised ``IOError`` and ``True`` otherwise
        """
        try:
            self.document.open(path)
        except IOError:
            return False, self.document
        return True, self.document

    def save(self, text, path=None):
        """Store *text* on the document and save it.

        :param text: new content assigned to ``document.text``
        :param path: optional new location; the document's path is only
                     replaced when this is not ``None``
        :return: ``True`` when saving succeeded, ``False`` on ``IOError``
        """
        document = self.document
        document.text = text
        if path is not None:
            document.path = path

        try:
            document.save()
        except IOError:
            return False
        return True
Example #3
0
File: tests.py Project: gnarph/DIRT
class DocumentTest(unittest.TestCase):
    """Tests for ``Document``: cloning, equality, dict conversion, loading."""

    file_name = u'models/test_data/lorem.json'
    meta = {
        'title': u'test 稢綌',
        'author': u'gorden 胇赲',
    }
    body = u'In id tristique orci. 痵痽 犵艿邔 疿疶砳 齸圞趲.'
    pre_file_name = file_name + '_PRE.json'
    raw_file_name = file_name

    def setUp(self):
        # Every test starts from a document built from the class-level samples.
        self.doc = Document(
            file_name=self.file_name,
            metadata=self.meta,
            pre_file_name=self.pre_file_name,
            raw_file_name=self.raw_file_name,
        )

    def test_clone(self):
        """
        A clone equals its source until any compared attribute diverges
        """
        clone = self.doc.clone()
        for attr in ('file_name', 'pre_file_name', 'raw_file_name',
                     'metadata', 'raw_body'):
            self.assertEqual(getattr(clone, attr), getattr(self.doc, attr))
        self.assertEqual(self.doc, clone)

        # assertFalse(a == b) rather than assertNotEqual so that __eq__
        # itself is what gets exercised.
        divergent_values = (
            ('file_name', u'nope'),
            ('metadata', None),
            ('raw_file_name', ''),
            ('pre_file_name', ''),
        )
        for attr, divergent in divergent_values:
            setattr(clone, attr, divergent)
            self.assertFalse(self.doc == clone)
            # Put the original value back before diverging the next attribute.
            setattr(clone, attr, getattr(self.doc, attr))

    def test_to_dict(self):
        """
        The dictionary form (used for json serialization) mirrors the document
        """
        as_dict = self.doc.to_dict()
        for key in ('file_name', 'metadata', 'pre_file_name'):
            self.assertEqual(as_dict[key], getattr(self.doc, key))
        # TODO check raw

    def test_open(self):
        """
        Loading an invalid file raises InvalidDocumentException
        """
        for bad_path in ('models/test_data/invalid.json',
                         'models/test_data/invalid.txt'):
            self.assertRaises(InvalidDocumentException, Document.from_json,
                              bad_path)
Example #4
0
    def test_open_blank_file(self):
        """Opening an empty file records its path and yields empty text."""
        document = Document()
        # Create (or truncate) the fixture; the context manager closes it
        # even if something goes wrong.
        with open("test_file", "w"):
            pass
        document.open("test_file")

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual("test_file", document.path)
        self.assertEqual("", document.text)
Example #5
0
 def test_open_blank_file(self):
     """Opening an empty file records its path and yields empty text."""
     document = Document()
     # Create (or truncate) the fixture file through a context manager so
     # the handle is always closed.
     with open("test_file", "w"):
         pass
     document.open("test_file")

     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual("test_file", document.path)
     self.assertEqual("", document.text)
Example #6
0
    def test_open_text_file(self):
        """Opening a file with content exposes its path and its text."""
        document = Document()
        # Write the fixture through a context manager so it is flushed and
        # closed before the document reads it.
        with open("test_file", "w") as text_file:
            text_file.write("this is only a test")
        document.open("test_file")

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual("test_file", document.path)
        self.assertEqual("this is only a test", document.text)
Example #7
0
    def add_item_name_test(self):
        """The list item created for a document shows the file's base name."""
        controller = DocumentListController()
        document = Document()
        document.path = "/path/to/test.tf"

        controller.add(document)
        # list(...) keeps the lookup working when keys() returns a view
        # (Python 3) instead of a list (Python 2).
        item = list(controller.association.keys())[0]

        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual("test.tf", item.text())
Example #8
0
 def test_open_text_file(self):
     """Opening a file with content exposes its path and its text."""
     document = Document()
     # The context manager guarantees the fixture is flushed and closed
     # before the document opens it.
     with open("test_file", "w") as text_file:
         text_file.write("this is only a test")
     document.open("test_file")

     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual("test_file", document.path)
     self.assertEqual("this is only a test", document.text)
Example #9
0
 def test_text_save_file(self):
     """Saving a document writes its text to the file at its path."""
     document = Document()
     document.text = "this is only a test of save file"
     document.path = "test_file"

     document.save()

     # Read the result back through a context manager; the handle used to
     # be left open for the rest of the test run.
     with open(document.path, "r") as text_file:
         saved_text = text_file.read()
     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual(document.text, saved_text)
Example #10
0
 def test_save_inexistent_file(self):
     """After saving, a file exists at the path and the text is unchanged."""
     document = Document()
     document.text = "this is only a test of save file"
     document.path = "test_file"

     document.save()

     self.assertTrue(os.path.exists(document.path))
     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual("this is only a test of save file", document.text)
Example #11
0
 def add_item_name2_test(self):
     """The list item created for a document shows the file's base name."""
     controller = DocumentListController()

     document = Document()
     document.path = "/path/to/othertest.tf"

     controller.add(document)
     # list(...) works for both list results (Python 2) and key views
     # (Python 3); indexing a view directly raises TypeError.
     item = list(controller.association.keys())[0]

     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual("othertest.tf", item.text())
Example #12
0
def parse_invoices(entries, account):
    """Create or refresh one invoice ``Document`` per entry.

    Every entry carries its payload under the ``'invoices'`` key.  The
    invoice is looked up by ``uid``; a missing one is created for *account*
    and added to ``db``.  Its fields are then overwritten from the payload
    and the tag for the payload's ``state`` is appended, plus the ``'due'``
    tag for open invoices whose due date has passed.

    :param entries: iterable of dicts, each with an ``'invoices'`` payload
    :param account: account assigned to newly created invoices
    """
    for entry in entries:
        data = entry['invoices']
        invoice = db.query(Document).filter_by(uid=data['id']).first()
        partner = db.query(Partner).filter_by(uid=data['client_id']).first()

        if invoice is not None:
            # Existing invoice: keep only tags that are not values of the
            # ``tags`` mapping; an empty remainder leaves the list untouched.
            remaining = list(set(invoice.tags) - set(tags.values()))
            if remaining:
                invoice.tags = remaining
        else:
            invoice = Document(uid=data['id'],
                               account=account,
                               tags=[typetags['invoice']])
            db.add(invoice)

        invoice.name = data['subject']
        invoice.value = data['amount']
        invoice.date = datetime.strptime(data['issued_at'], '%Y-%m-%d')
        invoice.updated_at = datetime.strptime(data['updated_at'],
                                               '%Y-%m-%dT%H:%M:%SZ')
        invoice.meta = json.dumps(data)
        invoice.partner = partner
        invoice.tags.append(tags[data['state']])

        if data['state'] == 'open':
            due_date = datetime.strptime(data['due_at'], '%Y-%m-%d')
            if due_date < datetime.now():
                invoice.tags.append(tags['due'])
Example #13
0
    def change_filename_test(self):
        """Changing a document's filename updates the text of its list item."""
        controller = DocumentListController()

        document = Document()
        document.path = "/path/to/othertest.tf"

        controller.add(document)
        # list(...) keeps the lookup working when keys() returns a view
        # (Python 3) instead of a list (Python 2).
        item = list(controller.association.keys())[0]
        controller.change_filename(document, "/new/file/name.tf")

        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual("name.tf", item.text())
Example #14
0
    def test__get_revision_by_timestamp(self):
        """Looking up a revision by its own timestamp returns that revision."""
        when = datetime(2020, 1, 1, 1, 1, 1)
        only_revision = Revision(id=1, content='hello', timestamp=when,
                                 document_id=5)
        doc = Document(id=5, title='blah', revisions=[only_revision])

        found = doc.get_revision_by_timestamp(when)

        assert found == only_revision
Example #15
0
 def change_filename_test(self):
     """Changing a document's filename updates the text of its list item."""
     controller = DocumentListController()

     document = Document()
     document.path = "/path/to/othertest.tf"

     controller.add(document)
     # Indexing keys() directly only works on Python 2; list(...) is valid
     # on both major versions.
     item = list(controller.association.keys())[0]
     controller.change_filename(document, "/new/file/name.tf")

     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual("name.tf", item.text())
Example #16
0
    def remove_item_dict2_test(self):
        """Removing one of two documents leaves only the other associated."""
        controller = DocumentListController()

        document = Document()
        document2 = Document()
        controller.add(document)
        controller.add(document2)
        controller.remove(document)

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertIn / assertNotIn report the container contents on failure.
        self.assertEqual(1, len(controller.association))
        self.assertIn(document2, controller.association.values())
        self.assertNotIn(document, controller.association.values())
Example #17
0
 def from_dict(d):
     """
     Build a MatchSet from its dictionary form
     :param d: dict with 'matches', 'alpha_doc' and 'beta_doc' entries
     :return: MatchSet
     """
     # Matches are converted first, then the two documents are loaded
     # from the json locations stored in the dict.
     converted_matches = list(map(Match.from_dict, d['matches']))
     alpha_document = Document.from_json(d['alpha_doc'])
     beta_document = Document.from_json(d['beta_doc'])
     return MatchSet(
         alpha_doc=alpha_document,
         beta_doc=beta_document,
         matches=converted_matches,
     )
Example #18
0
 def test_replace_text_save_file(self):
     """Saving over an existing file replaces its previous content."""
     # Seed the file with content that the save is expected to overwrite.
     with open("test_file", "w") as text_file:
         text_file.write("this is only a test file")

     document = Document()
     document.text = "I changed the text"
     document.path = "test_file"

     document.save()

     # The verification handle used to be left open; the context manager
     # closes it as soon as the content has been read.
     with open("test_file", "r") as text_file:
         saved_text = text_file.read()
     # assertEquals is a deprecated alias (removed in Python 3.12).
     self.assertEqual(document.text, saved_text)
 def setUp(self):
     """Create the controller, grab its cache and build the test collection.

     ``self.test_collection_evil`` uses the fixed address ``'ffafaf'``, two
     keywords with explicit ids and three documents with hard-coded hashes.
     The previously generated uuid locals were never used and are gone.
     """
     self.controller = Controller()
     self.cache = self.controller.cache
     self.address = 'ffafaf'
     self.test_collection_evil = Collection(
         title="Test multiple33333",
         description="This is a collection! with multiple docs222",
         address=self.address,
         btc="123456789",
         keywords=[
             Keyword(name="Keyword A", id=1199),
             Keyword(name="Keyword c", id=1214),
         ],
         documents=[
             Document(
                 description="Test document Z",
                 hash="zzzzzzzz",
                 title="Test Z",
                 accesses=0,
                 filename="joe.txt",
                 collection_address="BM-2cSrapXpgDTFD8AyDmU1BGifNkB2Z6X9k8"
             ),
             Document(
                 description="Test document B",
                 hash='gdssgsdg',
                 title="Test B",
                 accesses=3,
                 filename="gile.txt",
                 collection_address="BM-2cSrapXpgDTFD8AyDmU1BGifNkB2Z6X9k8"
             ),
             Document(
                 description="Test document Bddd",
                 hash='afff',
                 title="Test B",
                 accesses=3,
                 filename="gile.txt",
                 collection_address="BM-2cSrapXpgDTFD8AyDmU1BGifNkB2Z6X9k8"
             ),
         ],
         creation_date=datetime.datetime.now(),
         oldest_date=datetime.datetime.now(),
         latest_broadcast_date=datetime.datetime.now(),
         latest_btc_tx="btctx1",
         oldest_btc_tx="btctx12",
         accesses=2,
         votes=3,
         votes_last_checked=datetime.datetime.now())
Example #20
0
File: DIRT.py Project: gnarph/DIRT
def process_parallel_worker(a, output_dir, gap_length, match_length, b, comparator):
    """
    Compare one pair of document files; meant to be run as a parallel worker

    :param a: json location of the alpha document
    :param output_dir: output directory handed to the processor
    :param gap_length: gap length handed to the processor
    :param match_length: match length handed to the processor
    :param b: json location of the beta document
    :param comparator: name formatted into COMPARATOR_PATH to pick the
                       comparator module
    """
    module_path = COMPARATOR_PATH.format(comparator)
    comparator_module = importlib.import_module(module_path)
    worker = processor.Processor(
        output_dir=output_dir,
        comparator=comparator_module,
        gap_length=gap_length,
        match_length=match_length,
        percentage_match_length=None,
    )
    worker.process(
        alpha_document=Document.from_json(a),
        beta_document=Document.from_json(b),
    )
Example #21
0
    def generate_documents(self, number) -> List[Document]:
        """Return ``2 * number`` documents with consecutive ids starting at 1.

        The first ``number`` documents get ``0`` as second constructor
        argument, the following ``number`` get ``1``.
        NOTE(review): 0 / 1 presumably stand for Relevance.NOT_RELEVANT and
        Relevance.RELEVANT — confirm against the Relevance enum.
        """
        not_relevant = [Document(doc_id, 0)
                        for doc_id in range(1, number + 1)]
        relevant = [Document(doc_id, 1)
                    for doc_id in range(number + 1, 2 * number + 1)]
        return not_relevant + relevant
Example #22
0
def process_parallel_worker(a, output_dir, gap_length, match_length, b,
                            comparator):
    """
    Compare one pair of document files; meant to be run as a parallel worker

    :param a: json location of the alpha document
    :param b: json location of the beta document
    :param comparator: name formatted into COMPARATOR_PATH to pick the
                       comparator module
    The remaining parameters are passed to the processor unchanged.
    """
    comparator_module = importlib.import_module(
        COMPARATOR_PATH.format(comparator))
    processor_kwargs = dict(
        output_dir=output_dir,
        comparator=comparator_module,
        gap_length=gap_length,
        match_length=match_length,
        percentage_match_length=None,
    )
    pair_processor = processor.Processor(**processor_kwargs)

    alpha_document = Document.from_json(a)
    beta_document = Document.from_json(b)
    pair_processor.process(alpha_document=alpha_document,
                           beta_document=beta_document)
Example #23
0
	def post(self):
		"""Store the submitted HTML, CSS and display name on an existing document.

		Reads ``htmlcontent``, ``csscontent``, ``documentName`` and
		``humanname`` from the request, takes the first result of the
		ancestor query for ``documentName`` and saves it with the new values.
		"""
		self.response.write('<html><body>')
		form = self.request
		new_html = form.get('htmlcontent')
		new_css = form.get('csscontent')
		document_name = form.get('documentName')
		human_name = form.get('humanname')

		# Indexing the fetch result assumes the document already exists.
		ancestor_key = Document.getkey(document_name)
		document = Document.query(ancestor=ancestor_key).fetch()[0]

		document.htmlcontent = new_html
		document.csscontent = new_css
		document.name = human_name

		document.put()
Example #24
0
    def _build_docs_keywords(self, payload, collection):
        """
        Builds a list of Keyword objects and a list of Document objects from the received json.

        :param payload: The payload of the FJ Message including the documents and keywords
        :return: Two lists representing the documents and keywords of the FJ Message
        """
        for key in payload["keywords"]:
            db_key = self.cache.get_keyword_by_id(key["id"])
            if db_key is not None:
                collection.keywords.append(db_key)
            else:
                collection.keywords.append(Keyword(name=key["name"]))

        for doc in payload["documents"]:
            db_doc = self.cache.get_document_by_hash(doc["hash"])
            if db_doc is not None:
                collection.documents.append(db_doc)
            else:
                collection.documents.append(
                    Document(collection_address=doc["address"],
                             description=doc["description"],
                             hash=doc["hash"],
                             title=doc["title"],
                             filename=doc["filename"],
                             accesses=doc["accesses"]))
def add_collection():
    """Create a one-document test collection in the module-level cache.

    Address, btc value, document hash and the document's collection address
    are all freshly generated ``uuid.uuid1()`` strings.  The collection is
    inserted through ``our_cache``, its hash is refreshed via
    ``collections.update_hash`` and the cache session is committed.

    The unused ``coll_address`` / ``doc_hash_2`` locals were dropped, as was
    the ``global`` statement: ``our_cache`` is only read here.

    :return: the newly created ``Collection``
    """
    # NOTE(review): the document's collection_address is an independent uuid,
    # not the collection's own address — confirm this is intended.
    document_collection_address = str(uuid.uuid1())
    coll = Collection(
        title="Test",
        description="This is a collection!",
        address=str(uuid.uuid1()),
        btc=str(uuid.uuid1()),
        keywords=[],
        documents=[
            Document(
                collection_address=document_collection_address,
                description="Test document A",
                hash=str(uuid.uuid1()),
                title="Test A",
            ),
        ],
        creation_date=datetime.datetime.now(),
        oldest_date=datetime.datetime.now(),
        latest_broadcast_date=datetime.datetime.now(),
    )
    our_cache.insert_new_collection(coll)
    collections.update_hash(coll)
    our_cache.session.commit()
    return coll
Example #26
0
    def search(self, query) -> Documents:
        """Run a boosted multi-match query and collect the accepted hits.

        A hit is skipped when its title contains an entry of TITLE_EXCLUDES,
        its category contains an entry of CAT_EXCL, or its text contains
        REFER_TEXT.  Accepted hits are added to the returned ``Documents``
        together with their score and their position in the response; a
        table of the accepted hits and the skip count are logged.

        :param query: search text passed to the multi_match query
        :return: ``Documents`` holding one ``Document`` per accepted hit
        """
        boosted_fields = [
            'title^' + str(ELASTIC_TITLE_BOOST),
            'text^' + str(ELASTIC_TEXT_BOOST),
        ]
        request = Search(using=self.client, index=INDEX_NAME).query(
            "multi_match", query=query, fields=boosted_fields)
        hits = request.execute()

        table = PrettyTable(['Index', 'Title', 'Score', 'Popularity'])
        docs = Documents()
        skipped = 0
        for idx, hit in enumerate(hits):
            excluded = (
                any(excl in hit.title for excl in TITLE_EXCLUDES)
                or any(excl in hit.category for excl in CAT_EXCL)
                or REFER_TEXT in hit.text
            )
            if excluded:
                skipped += 1
                continue

            # Round-trip both fields through utf-8.
            hit.title = str.encode(hit.title, encoding='utf-8').decode(encoding='utf-8')
            hit.text = str.encode(hit.text, encoding='utf-8').decode(encoding='utf-8')

            docs.add(Document(hit.title, hit.text, hit.meta.score, idx))
            score_text = '{0:.2f}'.format(hit.meta.score)
            table.add_row([idx, hit.title, score_text, hit.popularity_score])

        Logger.info('Elastic result:\n' + str(table))
        Logger.info(str(skipped) + ' elastic results were skipped')
        return docs
Example #27
0
def get_documents():
    """Return one page of documents, serialized with ``documents_schema``.

    Query parameters: ``page`` (converted with ``int``, default 1) and
    ``order`` (default ``'desc'``).
    """
    query_args = request.args
    page = int(query_args.get('page', 1))
    order = query_args.get('order', 'desc')

    page_of_documents = Document.get_by_page(order, page)
    serialized = documents_schema.dump(page_of_documents)
    return response(serialized)
Example #28
0
def put_document(file_path, collection_address, title, description):
    """ Insert a document into the local cache with associated information
        and upload the document to the freenet network.

        The file is copied into the configured document directory under a
        name built from the returned URI plus the original extension, and the
        hash of the owning collection is refreshed afterwards.

        :param file_path: the path of the file to upload
        :param collection_address: the collection address associated with the document
        :param title: the title of the document being uploaded
        :param description: the description of the document being uploaded
    """
    file_name = os.path.basename(file_path)
    # Read through a context manager so the handle is closed right away
    # instead of lingering until garbage collection.
    with open(file_path) as source_file:
        contents = source_file.read()
    freeCon = FreenetConnection()
    uri = freeCon.put(contents)
    _, extension = os.path.splitext(file_name)
    new_file_name = uri + extension
    # NOTE(review): plain concatenation relies on DOCUMENT_DIRECTORY_PATH
    # ending with a path separator — confirm against the config.
    shutil.copy(file_path, os.path.expanduser(config.DOCUMENT_DIRECTORY_PATH) + new_file_name)
    document = Document(
        collection_address=collection_address,
        description=description,
        hash=uri,
        title=title,
        filename=new_file_name,
        accesses=0
    )
    cache.insert_new_document(document)
    collection = cache.get_collection_with_address(collection_address)
    collections.update_hash(collection)
    print ("Inserted " + file_path + " successfully with URI " + uri)
    print ("Allow up to 10 minutes for file to propogate on the freenet network")
Example #29
0
def convert_to_objects(a_paths, corpus, encoding, train_size):
    """Parse up to ``train_size`` annotation files into ``Document`` objects.

    The parser and the location of the matching text file depend on the
    corpus: a name containing ``'MADE-1.0'`` is parsed with ``parse_xml``,
    one containing ``'corpus_release'`` with ``parse_brat``.  The relations
    from ``get_fictive_relations`` are appended to the parsed references.

    :param a_paths: annotation file paths; only the first ``train_size`` are used
    :param corpus: corpus identifier used to pick the parser
    :param encoding: encoding passed on to the parsers
    :param train_size: maximum number of paths to convert
    :return: list of ``Document`` objects, one per converted path
    :raises ValueError: if a path has to be converted but ``corpus`` matches
        neither supported corpus (this used to surface as an
        ``UnboundLocalError`` further down)
    """
    docs = []
    for path in a_paths[:train_size]:
        if 'MADE-1.0' in corpus:
            e_list, r_list = parse_xml(path, encoding)
            text_path = path.replace('annotations', 'corpus').replace(
                '.bioc.xml', '')
        elif 'corpus_release' in corpus:
            e_list, r_list = parse_brat(path, encoding)
            text_path = path.replace('ann', 'txt')
        else:
            raise ValueError('unsupported corpus: {!r}'.format(corpus))

        fictive_relations = get_fictive_relations(
            e_list, r_list, text_path, encoding)
        docs.append(Document(
            entities=e_list,
            references=r_list + fictive_relations,
            annotation_path=path,
            text_path=text_path,
        ))
    return docs
Example #30
0
    def get_item_from_document_test(self):
        """The item returned for a document maps back to that document."""
        controller = DocumentListController()

        document = Document()
        controller.add(document)

        item = controller.get_item_from_document(document)
        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual(document, controller.association[item])
Example #31
0
    def remove_item_dict_test(self):
        """Removing the only document leaves the association empty."""
        controller = DocumentListController()

        document = Document()
        controller.add(document)
        controller.remove(document)

        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual(0, len(controller.association))
Example #32
0
    def setUp(self):
        """Create the controller, a fresh address, a collection and a signature.

        The collection uses the address created through the controller's
        connection and holds two keywords and two documents whose hashes are
        freshly generated uuids.  The unused ``coll_address`` and
        ``doc_hash_3`` locals were removed.
        """
        self.controller = Controller()
        self.address = self.controller.connection.create_address('Controller Test address', True)

        doc_hash_1 = str(uuid.uuid1())
        doc_hash_2 = str(uuid.uuid1())

        self.test_collection = Collection(
            title="Test",
            description="This is a collection!",
            address=self.address,
            btc="123456789",
            keywords=[
                Keyword(name="Keyword A"),
                Keyword(name="Keyword B"),
            ],
            documents=[
                Document(
                    description="Test document A",
                    hash=doc_hash_1,
                    title="Test A",
                    accesses=0,
                    filename="joe.txt",
                    collection_address="afgagahhsgh"
                    ),
                Document(
                    description="Test document B",
                    hash=doc_hash_2,
                    title="Test B",
                    accesses=3,
                    filename="gile.txt",
                    collection_address="afgagasghhhss"
                    ),
            ],
            creation_date=datetime.datetime.now(),
            oldest_date=datetime.datetime.now(),
            latest_broadcast_date=datetime.datetime.now(),
            latest_btc_tx="btctx1",
            oldest_btc_tx="btctx12",
            accesses=2,
            votes=3,
            votes_last_checked=datetime.datetime.now()
        )
        self.test_signature = Signature(pubkey='itsakey', address=self.address)
Example #33
0
def get_documents_date():
    """Return the 'COMPRA' documents dated within the requested range.

    Query parameters ``from`` and ``to`` are ISO-format strings; each falls
    back to the current local date and time when absent.  One day is added
    to the ``to`` value before it is passed on as ``date_to``.
    """
    today = datetime.today()
    raw_from = request.args.get('from')
    raw_to = request.args.get('to')

    # The old defaults were datetime objects that were then passed to
    # datetime.fromisoformat(), which only accepts strings, so a request
    # without both parameters failed with TypeError.
    date_from = today if raw_from is None else datetime.fromisoformat(raw_from)
    date_to = today if raw_to is None else datetime.fromisoformat(raw_to)
    date_to += timedelta(days=1)

    purchases = Document.get_by_dates(document_type='COMPRA',
                                      date_from=date_from, date_to=date_to)

    return response(documents_schema.dump(purchases))
Example #34
0
    def add_dict_test(self):
        """Adding a document maps a QStandardItem key to that document."""
        controller = DocumentListController()
        document = Document()

        controller.add(document)

        # list(...) keeps the indexing valid when values()/keys() return
        # views (Python 3) instead of lists (Python 2).
        first_value = list(controller.association.values())[0]
        first_key = list(controller.association.keys())[0]

        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual(document, first_value)
        self.assertTrue(type(first_key) == QtGui.QStandardItem)
Example #35
0
	def get(self):
		"""Render the editor page for the requested document.

		Writes the 404 template instead when the ancestor query for
		``documentName`` returns nothing.
		"""
		document_name = self.request.get('documentName')
		ancestor_key = Document.getkey(document_name)
		matches = Document.query(ancestor=ancestor_key).fetch()

		if not matches:
			self.response.write(TEMPLATE_404.render({}))
			return

		document = matches[0]
		template = JINJA_ENVIRONMENT.get_template('editor.html')
		page = template.render({
			'htmlcontent' : document.htmlcontent,
			'csscontent' : document.csscontent,
			'humanname' : document.name,
			'id' : document_name,
			'title' : 'Editing ' + document.name
		})
		self.response.write(page)
Example #36
0
    def test__get_latest_revision(self):
        """Of two revisions, the one with the later timestamp is the latest."""
        older = Revision(id=1,
                         content='hello',
                         timestamp=datetime(2020, 1, 1, 1, 1, 1),
                         document_id=5)
        newer = Revision(id=2,
                         content='hello again',
                         timestamp=datetime(2020, 2, 1, 1, 1, 1),
                         document_id=5)
        doc = Document(id=5, title='blah', revisions=[older, newer])

        latest = doc.get_latest_revision()

        assert latest == newer
Example #37
0
    def remove_item_return_test(self):
        """remove() returns the list item that was associated with the document."""
        controller = DocumentListController()

        document = Document()
        controller.add(document)

        # list(...) keeps the indexing valid when items() returns a view
        # (Python 3) instead of a list (Python 2).
        document_item = list(controller.association.items())[0][0]
        removed_item = controller.remove(document)

        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual(document_item, removed_item)
Example #38
0
    def process(self):
        """
        Preprocess the input file
        Writes the raw text, the standardized text and a json file
        describing the resulting models.Document into the output directory.
        Does nothing when the json output file already exists.
        """
        started = time.time()
        source = self.file_name
        name = path.get_name(source, extension=False)
        out_file = os.path.join(self.output_dir, name + PREPROCESS_SUFFIX)
        if file_ops.exists(out_file):
            # Already preprocessed
            return

        # TEI/XML input goes through the TEI reader, which also yields
        # metadata; any other file is read as plain utf-8 text.
        if source.endswith(('.tei', '.xml')):
            raw_text, metadata = TEIReader(source).read()
        else:
            raw_text, metadata = file_ops.read_utf8(source), {}

        plain_name = name + PLAIN_SUFFIX
        raw_file = os.path.join(self.output_dir, 'raw' + os.sep, plain_name)
        file_ops.write_utf8(raw_file, raw_text)

        standardized = self.standardizer.standardize(raw_text)
        pre_file = os.path.join(self.output_dir, 'pre' + os.sep, plain_name)
        file_ops.write_utf8(pre_file, standardized)

        document = Document(file_name=self.file_name,
                            raw_file_name=raw_file,
                            pre_file_name=pre_file,
                            metadata=metadata)
        file_ops.write_json_utf8(out_file, document.to_dict())

        elapsed = time.time() - started
        self._log_duration(elapsed, self.file_name, len(raw_text))
Example #39
0
def process_serial(args, alpha_files, beta_files):
    """
    Process on a single thread

    Every unordered pair of distinct files drawn from ``alpha_files`` x
    ``beta_files`` is compared exactly once.

    :param args: options object providing comparator, output_dir,
                 gap_length and match_length
    :param alpha_files: json locations of the alpha documents
    :param beta_files: json locations of the beta documents
    :return: number of pairs that were compared
    """
    comparator_path = COMPARATOR_PATH.format(args.comparator)
    comparator = importlib.import_module(comparator_path)
    pro = processor.Processor(output_dir=args.output_dir,
                              comparator=comparator,
                              gap_length=args.gap_length,
                              match_length=args.match_length,
                              percentage_match_length=None)
    # A set of sorted tuples gives constant-time "already compared?" checks;
    # the former list made the whole loop quadratic in the number of pairs.
    compared = set()
    for a, b in itertools.product(alpha_files, beta_files):
        this_pair = tuple(sorted([a, b]))
        if a != b and this_pair not in compared:
            alpha = Document.from_json(a)
            beta = Document.from_json(b)
            pro.process(alpha_document=alpha, beta_document=beta)
            compared.add(this_pair)
    return len(compared)
Example #40
0
 def test_different_root_hash(self):
     """Each inserted document followed by a hash update yields a new root hash."""
     first_document = Document(
         description="Test document A",
         hash="asdfasdfa;sldkfja;sldkfja;dljkfa;ldf",
         collection_address="bm-first",
         title="Test A",
     )
     second_document = Document(
         description="Test document B",
         hash="fdasdfsdfsdfsdfsdfsdfsdfdfsdfsddfdfdf",
         collection_address="bm-first",
         title="Test B",
     )

     # Insert one document at a time, refreshing the hash after each.
     for new_document in (first_document, second_document):
         self.cache.insert_new_document(new_document)
         collections.update_hash(self.collection1)

     versions = self.cache.get_versions_for_collection(
         self.collection1.address)
     self.assertNotEqual(versions[0].root_hash, versions[1].root_hash)
Example #41
0
File: DIRT.py Project: gnarph/DIRT
def process_serial(args, alpha_files, beta_files):
    """
    Process on a single thread

    Compares each unordered pair of distinct files from ``alpha_files`` x
    ``beta_files`` once and reports how many comparisons were made.

    :param args: options object providing comparator, output_dir,
                 gap_length and match_length
    :param alpha_files: json locations of the alpha documents
    :param beta_files: json locations of the beta documents
    :return: number of pairs that were compared
    """
    comparator_path = COMPARATOR_PATH.format(args.comparator)
    comparator = importlib.import_module(comparator_path)
    pro = processor.Processor(output_dir=args.output_dir,
                              comparator=comparator,
                              gap_length=args.gap_length,
                              match_length=args.match_length,
                              percentage_match_length=None)
    # Membership tests on a set are O(1); scanning a growing list of pairs
    # for every candidate made this loop needlessly quadratic.
    seen_pairs = set()
    for a, b in itertools.product(alpha_files, beta_files):
        pair = tuple(sorted([a, b]))
        if a == b or pair in seen_pairs:
            continue
        pro.process(alpha_document=Document.from_json(a),
                    beta_document=Document.from_json(b))
        seen_pairs.add(pair)
    return len(seen_pairs)
Example #42
0
    def test__get_revision_by_timestamp_expect_most_recent(self):
        """A timestamp between two revisions resolves to the earlier revision."""
        earlier = Revision(id=1,
                           content='hello',
                           timestamp=datetime(2020, 1, 1, 1, 1, 1),
                           document_id=5)
        later = Revision(id=2,
                         content='hello again',
                         timestamp=datetime(2020, 2, 1, 1, 1, 1),
                         document_id=5)
        doc = Document(id=5, title='blah', revisions=[earlier, later])

        # Falls after the first revision and before the second one.
        in_between = datetime(2020, 1, 29, 1, 1, 1)
        found = doc.get_revision_by_timestamp(in_between)

        assert found == earlier
Example #43
0
	def get(self):
		"""Render the read-only view of the requested document.

		Writes the 404 template instead when the ancestor query for
		``documentName`` returns nothing.  The ``htmlcontent`` and
		``csscontent`` request values were read but never used, so those
		reads are gone; the stored document supplies both.
		"""
		documentName = self.request.get('documentName')

		documents_query = Document.query(ancestor=Document.getkey(documentName))
		matches = documents_query.fetch()

		if not matches:
			self.response.write(TEMPLATE_404.render({}))
			return

		document = matches[0]
		template = JINJA_ENVIRONMENT.get_template('view.html')
		self.response.write(template.render({
			'htmlcontent' : document.htmlcontent,
			'csscontent' : document.csscontent,
			'editurl' : '/edit?' + urllib.urlencode({'documentName' : documentName}),
			'title' : document.name,
			'id' : documentName
		}))
Example #44
0
File: tests.py Project: gnarph/DIRT
    def setUp(self):
        # Ten single-letter passages per side: 'a'..'j' and 'A'..'J'.
        self.passages_a = [chr(ord('a') + offset) for offset in xrange(10)]
        self.passages_b = [chr(ord('A') + offset) for offset in xrange(10)]
        self.file_a = 'models/test_data/match_set_test.json'
        self.document_a = Document.from_json(self.file_a)
        self.file_b = 'models/test_data/match_set_test2.json'
        self.document_b = Document.from_json(self.file_b)

        self.matches = []
        # Pair the passages up position by position.
        self.singlet_pairs = [
            (MatchHalf(passage=passage_a), MatchHalf(passage=passage_b))
            for passage_a, passage_b in zip(self.passages_a, self.passages_b)
        ]
        # Alpha/beta need to be actual documents, not names
        self.matches = Processor.singlet_pairs_to_matches(
            alpha=self.document_a,
            beta=self.document_b,
            singlet_pairs=self.singlet_pairs)
        self.match_set = MatchSet(alpha_doc=self.document_a,
                                  beta_doc=self.document_b,
                                  matches=self.matches)
Example #45
0
	def get(self):
		"""Render list.html with one summary entry per stored document."""
		stored_documents = Document.query().fetch()

		# Each entry carries the view and edit links for one document.
		entries = [
			{
				'name' : stored.documentName,
				'url' : '/view?' + urllib.urlencode({'documentName' : stored.documentName}),
				'editurl' : '/edit?' + urllib.urlencode({'documentName' : stored.documentName}),
				'humanname' : stored.name or '[No name]',
				'date' : stored.date
			}
			for stored in stored_documents
		]

		template = JINJA_ENVIRONMENT.get_template('list.html')
		self.response.write(template.render({ 'list' : entries }))
Example #46
0
File: tests.py Project: gnarph/DIRT
 def test_read(self):
     """
     TEIReader output must agree with TEIDocument and with the stored json
     """
     real_data_file = self._get_test_file_name(TEI_ZHI)
     expected_body = tei_document.TEIDocument(real_data_file).get_data()['body']

     read_body, read_metadata = reader.TEIReader(real_data_file).read()
     self.assertEqual(expected_body, read_body)

     # The json document holds the reference body and metadata.
     stored_doc = Document.from_json(self._get_test_file_name(JSON_ZHI))
     self.assertEqual(read_body, stored_doc.raw_body)
     self.assertEqual(read_metadata, stored_doc.metadata)
Example #47
0
File: tests.py Project: gnarph/DIRT
 def test_smoke(self):
     """
     Smoke test - the preprocessor must run and leave a loadable document
     """
     preprocessor = Preprocessor(file_name=self.file_name,
                                 input_dir=self.input_dir,
                                 output_dir=self.output_dir)
     preprocessor.process()

     for produced in os.listdir(self.output_dir):
         base_name = utilities.path.get_name(self.file_name,
                                             extension=False)
         # Only outputs whose name contains the input's base name are checked.
         if base_name not in produced:
             continue
         doc = Document.from_json(os.path.join(self.output_dir, produced))
         self.assertNotEqual(doc.pre_file_name, self.file_name)
         self.assertEqual(doc.file_name, 'test_preprocessed/lorem.json')
Example #48
0
File: tests.py Project: gnarph/DIRT
 def setUp(self):
     # Build the fixture document from the file names and metadata
     # available on the test case.
     fixture_fields = dict(
         file_name=self.file_name,
         metadata=self.meta,
         pre_file_name=self.pre_file_name,
         raw_file_name=self.raw_file_name,
     )
     self.doc = Document(**fixture_fields)
Example #49
0
	def post(self):
		"""Create a document with default content and redirect to its editor."""
		document_name = Document.newname()
		default_html = constants.HTML_DEFAULT
		default_css = constants.CSS_DEFAULT

		# The new entity is parented under the key derived from its name and
		# then explicitly given that same key.
		document = Document(parent=Document.getkey(document_name))
		document.htmlcontent = default_html
		document.csscontent = default_css
		document.documentName = document_name
		document.name = constants.DEFAULT_NAME
		document.key = Document.getkey(document_name)
		document.put()

		edit_query = urllib.urlencode({'documentName':document_name})
		self.redirect('/edit?' + edit_query)
Example #50
0
File: tests.py Project: gnarph/DIRT
class DocumentTest(unittest.TestCase):
    """
    Unit tests for Document: cloning, conversion to a dictionary and
    opening from a json file
    """
    # Path passed to Document as file_name in setUp
    file_name = u'models/test_data/lorem.json'
    # Metadata passed to Document in setUp; values contain non-ASCII text
    meta = {'title': u'test 稢綌',
            'author': u'gorden 胇赲'
            }
    # Sample text mixing ASCII and non-ASCII characters
    # (not passed to Document in setUp)
    body = u'In id tristique orci. 痵痽 犵艿邔 疿疶砳 齸圞趲.'
    # Evaluates to u'models/test_data/lorem.json_PRE.json'
    pre_file_name = file_name + '_PRE.json'
    # Same path as file_name
    raw_file_name = file_name

    def setUp(self):
        """
        Build self.doc from the class-level file_name, meta,
        pre_file_name and raw_file_name fixtures
        """
        fixture_args = {
            'file_name': self.file_name,
            'metadata': self.meta,
            'pre_file_name': self.pre_file_name,
            'raw_file_name': self.raw_file_name,
        }
        self.doc = Document(**fixture_args)

    def test_clone(self):
        """
        Test that clone() gives an equal copy, and that == reports a
        difference when file_name, metadata, raw_file_name or
        pre_file_name is changed on the copy
        """
        twin = self.doc.clone()
        for attr in ('file_name', 'pre_file_name', 'raw_file_name',
                     'metadata', 'raw_body'):
            self.assertEqual(getattr(twin, attr), getattr(self.doc, attr))
        self.assertEqual(self.doc, twin)

        # assertFalse(a == b) is used rather than assertNotEqual so
        # that __eq__ is the method being exercised
        tampered = None
        for attr, wrong_value in (('file_name', u'nope'),
                                  ('metadata', None),
                                  ('raw_file_name', ''),
                                  ('pre_file_name', '')):
            if tampered is not None:
                # Undo the previous change so that only one attribute
                # differs from the original at a time
                setattr(twin, tampered, getattr(self.doc, tampered))
            setattr(twin, attr, wrong_value)
            self.assertFalse(self.doc == twin)
            tampered = attr

    def test_to_dict(self):
        """
        Test that to_dict() (used for json serialization) exposes
        file_name, metadata and pre_file_name under keys of the same name
        """
        as_dict = self.doc.to_dict()
        for key in ('file_name', 'metadata', 'pre_file_name'):
            self.assertEqual(as_dict[key], getattr(self.doc, key))
        # TODO check raw

    def test_open(self):
        """
        Test opening a Document json
        """
        self.assertRaises(InvalidDocumentException,
                          Document.from_json,
                          'models/test_data/invalid.json')
        self.assertRaises(InvalidDocumentException,
                          Document.from_json,
                          'models/test_data/invalid.txt')