Ejemplo n.º 1
0
 def encoding(self):
     """Return the character encoding, resolving and caching it on first use.

     The charset parsed from the ``content-type`` header is tried first.
     If that yields ``None``, the file at ``self.file_path`` is opened in
     binary mode and handed to ``guess_file_encoding``. The result is
     stored on ``self._encoding`` so later calls return it directly.
     """
     # Fast path: a previous call (or the caller) already set the value.
     if self._encoding is not None:
         return self._encoding

     content_type = self.headers.get('content-type')
     self._encoding = parse_mimetype(content_type).charset

     # The header carried no charset: fall back to inspecting the file.
     if self._encoding is None:
         with open(self.file_path, 'rb') as stream:
             self._encoding = guess_file_encoding(stream)
     return self._encoding
Ejemplo n.º 2
0
 def to_dict(self):
     """Return this object's fields as a plain dictionary.

     Besides the stored attributes, the result carries
     ``mime_type_label``: the label of the parsed ``self.mime_type``.
     Note that ``self.checksum`` is exposed under the ``sha1`` key.
     """
     parsed = parse_mimetype(self.mime_type)
     return dict(
         path=self.path,
         sha1=self.checksum,
         timestamp=self.timestamp,
         dataset=self.dataset,
         mime_type=self.mime_type,
         mime_type_label=parsed.label,
         size=self.size,
         title=self.title,
     )
Ejemplo n.º 3
0
def all_resources(conn: Conn,
                  dataset: Dataset) -> Generator[Resource, None, None]:
    """Yield every resource row of ``dataset``, ordered by path ascending.

    Each row is converted to a dict and enriched with:

    * ``mime_type_label`` -- only when the row's ``mime_type`` is not None;
    * ``url`` -- the result of ``dataset.make_public_url`` for the row's path.
    """
    query = (
        select(resource_table)
        .filter(resource_table.c.dataset == dataset.name)
        .order_by(resource_table.c.path.asc())
    )
    for record in conn.execute(query).fetchall():
        entry = cast(Resource, record._asdict())
        # The label is added for the web UI; open question from the original
        # author: should this enrichment live here?
        raw_type = entry["mime_type"]
        if raw_type is not None:
            entry["mime_type_label"] = parse_mimetype(raw_type).label
        entry["url"] = dataset.make_public_url(entry["path"])
        yield entry
Ejemplo n.º 4
0
 def caption(self, value: str) -> str:
     """Return the label of the parsed ``value``, or ``value`` itself.

     The raw ``value`` is returned whenever the parsed label is falsy.
     """
     label = parse_mimetype(value).label
     if label:
         return label
     return value
Ejemplo n.º 5
0
 def caption(self, value):
     """Return the label of ``value`` after parsing it as a MIME type."""
     parsed = parse_mimetype(value)
     return parsed.label
Ejemplo n.º 6
0
from pprint import pprint
from collections import defaultdict
import csv

from pantomime import parse_mimetype

# Debug script: read ';'-separated (mime type, count) rows from occrp.csv
# and print the label that parse_mimetype reports for each MIME type.
data = defaultdict(int)
# newline='' is what the csv module asks for when given a file object, so
# the reader itself handles line endings (including newlines embedded in
# quoted fields).
with open('occrp.csv', 'r', newline='') as fh:
    reader = csv.reader(fh, delimiter=';')
    for row in reader:
        # Every row must have exactly two columns; otherwise the unpacking
        # below raises ValueError.
        original, count = row
        parsed = parse_mimetype(original)
        print(parsed.label)
        # The per-type tally is currently disabled:
        # data[parsed.normalized] += int(count)
        # if parsed.normalized != original:
        #     pprint((original, parsed.label))

# NOTE: while the tally above stays disabled, `data` is never filled, so
# this always prints 0.
print(len(data))