Exemple #1
0
def parse(query):
    """Translate a search query string into a selection of ``File`` rows.

    An empty query selects every file. Any other query is tokenized and
    handed to ``_parse``. If the parsed result is iterable it is returned
    unchanged; otherwise it is used as the ``where`` condition of a
    ``File`` / ``Metadata`` left outer join.
    """
    if query == "":
        return File.select()

    parsed = _parse(iter(tokenize(query)))
    try:
        iter(parsed)
    except TypeError:
        # Not iterable -- presumably a bare filter expression; wrap it in a
        # query so callers always receive something they can iterate.
        joined = File.select().join(Metadata, peewee.JOIN_LEFT_OUTER)
        return joined.where(parsed)
    return parsed
Exemple #2
0
def get_entries():
    """Gets all file names currently in catalog

    Returns:
        list: the ``file_name`` of every ``File`` row; an empty list when
        the catalog holds no files.
    """
    # Iterating an empty selection already yields an empty list, so the
    # separate emptiness check (and the extra query it triggered) is not
    # needed.
    return [item.file_name for item in File.select()]
Exemple #3
0
def process(args, pool, log):
    """Run ``record_for`` over every file that has not been inspected yet.

    Files whose ``inspected_at`` is null are selected and their paths are
    fed through ``pool.imap``. Each item coming back is unpacked as
    ``(path, result, messages)``: messages are printed as errors, a
    ``None`` result only marks the path as processed, and any other result
    is saved and reported together with running count/size progress.

    Args:
        args: bound as the first argument of ``record_for``.
        pool: object providing ``imap(func, iterable)``.
        log: not used by this function.
    """
    pending = File.select().where(File.inspected_at.is_null()).execute()
    paths = [pathlib.Path(row.path) for row in pending]

    count_total = file_count_total(paths)
    size_total = file_size_total(paths)

    worker = functools.partial(record_for, args)

    size_processed = 0
    for count_processed, (path, result, messages) in enumerate(
            pool.imap(worker, paths), 1):
        size_processed += file_size(path)

        for message in messages:
            print('error: {}: {}'.format(message, path))

        if result is None:
            # Nothing to store for this path; just flag it as handled.
            mark_processed(path)
        else:
            save_media(result)
            print('process: {} {} {}/{} {}/{}'.format(
                result['source'], result['path'],
                count_processed, count_total,
                humanize.naturalsize(size_processed),
                humanize.naturalsize(size_total)))
Exemple #4
0
def view_all_entries():
    """Gather all entries in the catalog, along with their metadata

    Returns:
        list: one ``(file_name, description, date_created, tags)`` tuple
        per ``File`` row, ordered by file name. ``tags`` is the file's tag
        names, sorted and joined with ``' | '`` (an empty string when the
        file has no tags).
    """
    all_entries = (FileTag
                   .select(FileTag, File, Tag)
                   .join(Tag)
                   .switch(FileTag)
                   .join(File)
                   .order_by(File.file_name))

    # dictionary houses all files in catalog and each file's associated tags
    files_with_tags = {}
    for file_tag in all_entries:
        file_name = file_tag.file_id.file_name
        tag_name = file_tag.tag_id.tag_name
        files_with_tags.setdefault(file_name, []).append(tag_name)

    # Order in SQL instead of calling sorted() on model instances: the
    # instances have no defined ordering, so sorting them can raise
    # TypeError on Python 3 and never sorted by name anyway.
    entries = []
    for entry in File.select().order_by(File.file_name):
        tags = ' | '.join(sorted(files_with_tags.get(entry.file_name, [])))
        entries.append(
            (entry.file_name, entry.description, entry.date_created, tags))

    return entries
Exemple #5
0
def Token(token):
    """Return the set of files carrying the metadata named by *token*.

    A token containing a colon is split at its first colon into
    ``field`` and ``value``; a token without one is treated as a value of
    the ``tag`` field.
    """
    field, value = token.split(':', 1) if ':' in token else ('tag', token)
    condition = (Metadata.field == field) & (Metadata.value == value)
    joined = File.select().join(Metadata, peewee.JOIN_LEFT_OUTER)
    return set(joined.where(condition))
Exemple #6
0
    def get(self):
        """Return all categories, items and files as one JSON response.

        ``json_response`` is asked to exclude the ``blob`` field.
        """
        obj = {
            'categories': Category.select()[:],
            'items': Item.select()[:],
            'files': File.select()[:]
        }

        # ('blob') is just the string 'blob'; the trailing comma makes it the
        # one-element tuple of field names that the parentheses suggest.
        # NOTE(review): assumes json_response treats `exclude` as a
        # collection of names -- confirm against its definition.
        return json_response(obj, exclude=('blob',))
Exemple #7
0
def fetch_me_file_count(args):
    """Count the caller's files, optionally narrowed by mimetype or type.

    The caller is identified by ``args.authorization`` (resolved to a user)
    or, failing that, by ``args.fingerprint``. ``args.mimetype`` and
    ``args.type`` are mutually exclusive filters.

    Raises:
        ApiError: with status 401 when neither credential is given, when
            both filters are supplied, or when a supplied filter id does
            not resolve to a record.

    Returns:
        ApiResponse wrapping ``{'count': <number of matching files>}``.
    """
    if args.authorization is None and args.fingerprint is None:
        raise ApiError(status=401)

    if args.mimetype is not None and args.type is not None:
        raise ApiError('Choose Mimetype or Type, not both')

    mtype = None if args.type is None else MimetypeType.get_or_none(
        id=args.type)
    if args.type is not None and mtype is None:
        raise ApiError(metadata={'errors': {'type': 'Invalid Type'}})

    mimetype = None if args.mimetype is None else Mimetype.get_or_none(
        id=args.mimetype)
    if args.mimetype is not None and mimetype is None:
        raise ApiError(metadata={'errors': {'mimetype': 'Invalid Mimetype'}})

    # Authorization takes precedence over the fingerprint.
    if args.authorization is not None:
        owner = File.user == helpers.get_user(args.authorization)
    else:
        owner = File.fingerprint == args.fingerprint
    query = File.select().where(owner)

    if mtype is not None:
        with_mimetype = query.join(Mimetype,
                                   JOIN.RIGHT_OUTER,
                                   on=(Mimetype.id == File.mimetype))
        with_type = with_mimetype.join(MimetypeType,
                                       JOIN.RIGHT_OUTER,
                                       on=(MimetypeType.id == Mimetype.type))
        query = with_type.where(MimetypeType.id == mtype).switch(File)

    if mimetype is not None:
        query = query.where(File.mimetype == mimetype)

    return ApiResponse({'count': query.count()})
Exemple #8
0
    def handle_T_key(self):
        """Let the user edit the tags of the currently selected thumbnails.

        The existing metadata of the selected files (except ``import-time``)
        is shown as a comma-separated list of ``field:"value"`` tokens in a
        text dialog. When the dialog is confirmed, tokens that were added
        are stored for every selected file and tokens that disappeared are
        deleted from all of them. The tag list is repainted afterwards,
        whether or not the dialog was confirmed.
        """
        def split_token(token):
            # 'field:"value"' -> (field, value); a token without a colon is
            # a plain tag. Surrounding double quotes are stripped.
            if ':' in token:
                field, value = token.split(':', 1)
            else:
                field, value = 'tag', token
            return field, value.strip('"')

        # Get the File objects corresponding to the selected thumbnails
        files = File.select().where(
            File.md5 << [item.Text for item in self.get_selected_thumbs()])

        # Determine the existing tags for these files.
        old_tags = Metadata.filter(Metadata.file << files,
                                   Metadata.field.not_in(['import-time']))
        old_tags = sorted({'%s:"%s"' % (t.field, t.value) for t in old_tags})

        dialog = wx.TextEntryDialog(None,
                                    "Tags:",
                                    "Modifiy Tags",
                                    value=", ".join(old_tags))
        if dialog.ShowModal() == wx.ID_OK:
            # Determine the new tags for these files. Blank tokens (cleared
            # input, doubled or trailing commas) are dropped so they do not
            # turn into empty 'tag' records.
            new_tags = {t.strip() for t in dialog.GetValue().split(",")}
            new_tags.discard("")
            previous_tags = set(old_tags)

            # Add any new tags that have been added.
            for token in new_tags - previous_tags:
                field, value = split_token(token)
                # Create records for all selected files.
                for record in files:
                    try:
                        Metadata(file=record, field=field, value=value).save()
                    except IntegrityError:
                        # Best effort: presumably this file already carries
                        # the tag, so there is nothing to add.
                        pass

            # Remove any tags that were removed.
            for token in previous_tags - new_tags:
                field, value = split_token(token)
                Metadata.delete().where(Metadata.file << files,
                                        Metadata.field == field,
                                        Metadata.value == value).execute()
        # Repaint the tag list.
        self.update_tags()
Exemple #9
0
 def handle_backspace_key(self):
     """Ask for confirmation, then delete the selected files.

     Nothing happens unless the dialog is confirmed. On confirmation every
     file whose md5 matches a selected thumbnail is deleted recursively,
     the tag list and thumbnails are refreshed, and the preview is cleared.
     """
     prompt = wx.MessageDialog(
         self,
         "Remove %s Files?" % self.thumbnailGrid.SelectedItemCount,
         "Remove Files?",
         style=wx.OK | wx.CANCEL)
     prompt.SetOKLabel("Delete")
     if prompt.ShowModal() != wx.ID_OK:
         return

     selected_md5s = [item.Text for item in self.get_selected_thumbs()]
     for doomed in File.select().where(File.md5 << selected_md5s):
         doomed.delete_instance(recursive=True)

     # Repaint the tag list.
     self.update_tags()
     self.update_thumbnails()
     self.preview = None
Exemple #10
0
 def handle_backspace_key(self):
     """Delete the selected files after the user confirms the prompt.

     A cancelled prompt leaves everything untouched. Otherwise each file
     whose md5 belongs to a selected thumbnail is removed with
     ``delete_instance(recursive=True)``; the tag list and thumbnail grid
     are then refreshed and the preview is reset to ``None``.
     """
     question = "Remove %s Files?" % self.thumbnailGrid.SelectedItemCount
     confirm = wx.MessageDialog(self,
                                question,
                                "Remove Files?",
                                style=wx.OK | wx.CANCEL)
     confirm.SetOKLabel("Delete")
     if confirm.ShowModal() != wx.ID_OK:
         return

     md5s = [thumb.Text for thumb in self.get_selected_thumbs()]
     selected_files = File.select().where(File.md5 << md5s)
     for entry in selected_files:
         entry.delete_instance(recursive=True)

     # Repaint the tag list.
     self.update_tags()
     self.update_thumbnails()
     self.preview = None
Exemple #11
0
    def handle_T_key(self):
        """Let the user edit the tags of the currently selected thumbnails.

        The existing metadata of the selected files (except ``import-time``)
        is shown as a comma-separated list of ``field:"value"`` tokens in a
        text dialog. When the dialog is confirmed, tokens that were added
        are stored for every selected file and tokens that disappeared are
        deleted from all of them. The tag list is repainted afterwards,
        whether or not the dialog was confirmed.
        """
        def split_token(token):
            # 'field:"value"' -> (field, value); a token without a colon is
            # a plain tag. Surrounding double quotes are stripped.
            if ':' in token:
                field, value = token.split(':', 1)
            else:
                field, value = 'tag', token
            return field, value.strip('"')

        # Get the File objects corresponding to the selected thumbnails
        files = File.select().where(File.md5 << [item.Text for item in self.get_selected_thumbs()])

        # Determine the existing tags for these files.
        old_tags = Metadata.filter(Metadata.file << files, Metadata.field.not_in(['import-time']))
        old_tags = sorted({'%s:"%s"' % (t.field, t.value) for t in old_tags})

        dialog = wx.TextEntryDialog(None, "Tags:", "Modifiy Tags", value=", ".join(old_tags))
        if dialog.ShowModal() == wx.ID_OK:
            # Determine the new tags for these files. Blank tokens (cleared
            # input, doubled or trailing commas) are dropped so they do not
            # turn into empty 'tag' records.
            new_tags = {t.strip() for t in dialog.GetValue().split(",")}
            new_tags.discard("")
            previous_tags = set(old_tags)

            # Add any new tags that have been added.
            for token in new_tags - previous_tags:
                field, value = split_token(token)
                # Create records for all selected files.
                for record in files:
                    try:
                        Metadata(file=record, field=field, value=value).save()
                    except IntegrityError:
                        # Best effort: presumably this file already carries
                        # the tag, so there is nothing to add.
                        pass

            # Remove any tags that were removed.
            for token in previous_tags - new_tags:
                field, value = split_token(token)
                Metadata.delete().where(Metadata.file << files,
                                        Metadata.field == field,
                                        Metadata.value == value).execute()
        # Repaint the tag list.
        self.update_tags()
Exemple #12
0
def folder(folder_name):
    """Upload to, list, or delete the folder named *folder_name*.

    Responses are JSON bodies with a ``message`` field:

    * any method: 404 when the folder does not exist.
    * POST: stores the uploaded ``file`` under the upload directory as
      ``<folder_name>_<filename>`` (sanitized) and records it -- 201 on
      success, 409 when a file with that stored name already exists,
      400 when no usable file was sent.
    * GET: 200 with ``items`` describing every file in the folder.
    * DELETE: 200 on success, 409 when deletion raises an integrity error.
    * anything else: 405.
    """
    try:
        f = Folder.get(name=folder_name)
    except peewee.DoesNotExist:
        return jsonify(message='error'), 404

    if request.method == 'POST':
        upload = request.files.get('file')
        if not upload:
            # Previously this case fell through and the view returned None.
            return jsonify(message='error'), 400

        actual_filename = secure_filename(folder_name + '_' + upload.filename)
        target = os.path.join(app.config['UPLOAD_FOLDER'], actual_filename)
        if os.path.exists(target):
            return jsonify(message='error'), 409
        upload.save(target)
        # create() already persists the row, so no extra save() is needed.
        File.create(folder=folder_name,
                    filename=upload.filename,
                    public_share_url=generate_url(),
                    private_share_url=generate_url(),
                    private_share_password=generate_password(),
                    open_public_share=False,
                    open_private_share=False)
        return jsonify(message='OK'), 201

    if request.method == 'GET':
        files = File.select().where(File.folder == folder_name)
        items = [{
            'filename': x.filename,
            'public': x.public_share_url,
            'private': x.private_share_url,
            'password': x.private_share_password,
            'openPublic': x.open_public_share,
            'openPrivate': x.open_private_share
        } for x in files]

        return jsonify(message='OK', items=items)

    if request.method == 'DELETE':
        try:
            f.delete_instance()
        except peewee.IntegrityError:
            return jsonify(message='error'), 409
        return jsonify(message='OK')

    # Any other method: answer explicitly instead of returning None.
    return jsonify(message='error'), 405
Exemple #13
0
    def __init__(self, parent):
        """Set up browsing state and load a thumbnail for every known file.

        Args:
            parent: forwarded unchanged to the base-class initializer.
        """
        # super(type(self), self) resolves against the runtime class, so it
        # recurses forever as soon as this class is subclassed; the
        # zero-argument form always refers to the defining class.
        super().__init__(parent)
        self.add_dir = "~/Pictures"
        self._preview = None  # Track the currently selected image, by md5.
        self._filter = ""  # Track the current Filter
        self.filters = []  # Track previous Filters.

        # Only update the preview when we stop changing the list selection
        self.list_change_timer = wx.Timer(self)

        # Load the Grid
        self.thumbnail_index = {}  # md5 -> index returned by the image list
        self.thumbnails = wx.ImageList()
        self.thumbnailGrid.SetImageList(self.thumbnails, wx.IMAGE_LIST_NORMAL)
        for f in File.select():
            self.thumbnail_index[f.md5] = self.thumbnails.Add(f.as_bitmap())
        self.update_thumbnails()
        self.SetStatusText("Matching Images: %s" % self.thumbnailGrid.ItemCount)
        self.update_tags()
Exemple #14
0
    def __init__(self, parent):
        """Set up browsing state and load a thumbnail for every known file.

        Args:
            parent: forwarded unchanged to the base-class initializer.
        """
        # super(type(self), self) resolves against the runtime class, so it
        # recurses forever as soon as this class is subclassed; the
        # zero-argument form always refers to the defining class.
        super().__init__(parent)
        self.add_dir = "~/Pictures"
        self._preview = None  # Track the currently selected image, by md5.
        self._filter = ""  # Track the current Filter
        self.filters = []  # Track previous Filters.

        # Only update the preview when we stop changing the list selection
        self.list_change_timer = wx.Timer(self)

        # Load the Grid
        self.thumbnail_index = {}  # md5 -> index returned by the image list
        self.thumbnails = wx.ImageList()
        self.thumbnailGrid.SetImageList(self.thumbnails, wx.IMAGE_LIST_NORMAL)
        for f in File.select():
            self.thumbnail_index[f.md5] = self.thumbnails.Add(f.as_bitmap())
        self.update_thumbnails()
        self.SetStatusText("Matching Images: %s" %
                           self.thumbnailGrid.ItemCount)
        self.update_tags()
Exemple #15
0
def NOT(token):
    """Return the set of files whose id is not among those in *token*.

    *token* is iterated once and the ``id`` of each element is collected
    for the exclusion list.
    """
    excluded_ids = [f.id for f in token]
    remaining = File.select().where(File.id.not_in(excluded_ids))
    return set(remaining)
Exemple #16
0
def get_files_with_point(point):
    """Build the query for files whose ``point`` equals *point*.

    The query is returned unexecuted, ordered by ``File.order_num``.
    """
    matching = File.select().where(File.point == point)
    return matching.order_by(File.order_num)
Exemple #17
0
def fetch_files(args):
    """List the caller's files, newest id first, plus the overall total.

    The caller is identified by ``args.authorization`` (resolved to a user)
    or, failing that, by ``args.fingerprint``. ``args.mimetype`` and
    ``args.type`` are mutually exclusive filters applied to both the
    listing and the total. ``args.after`` / ``args.before`` (also mutually
    exclusive) restrict only the listing to ids above / below the given
    value; ``args.limit`` caps the listing.

    Raises:
        ApiError: with status 401 when neither credential is given, when
            conflicting options are supplied, or when a supplied filter id
            does not resolve to a record.

    Returns:
        ApiResponse wrapping ``{'total': ..., 'files': [...]}``.
    """
    if args.authorization is None and args.fingerprint is None:
        raise ApiError(status=401)

    if args.after is not None and args.before is not None:
        raise ApiError('Choose between before or after, not both')

    if args.mimetype is not None and args.type is not None:
        raise ApiError('Choose Mimetype or Type, not both')

    mtype = None if args.type is None else MimetypeType.get_or_none(
        id=args.type)
    if args.type is not None and mtype is None:
        raise ApiError(metadata={'errors': {'type': 'Invalid Type'}})

    mimetype = None if args.mimetype is None else Mimetype.get_or_none(
        id=args.mimetype)
    if args.mimetype is not None and mimetype is None:
        raise ApiError(metadata={'errors': {'mimetype': 'Invalid Mimetype'}})

    def restrict_to_type(query):
        # Join through Mimetype to MimetypeType and keep only `mtype` rows.
        joined = query.join(Mimetype,
                            JOIN.RIGHT_OUTER,
                            on=(Mimetype.id == File.mimetype))
        joined = joined.join(MimetypeType,
                             JOIN.RIGHT_OUTER,
                             on=(MimetypeType.id == Mimetype.type))
        return joined.where(MimetypeType.id == mtype).switch(File)

    listing = File.select(File, FileHash).join(FileHash).switch(File)
    files = listing.order_by(File.id.desc()).limit(args.limit)
    total = File.select(fn.COUNT(File.id).alias('count'))

    # Authorization takes precedence over the fingerprint.
    if args.authorization is not None:
        owner = File.user == helpers.get_user(args.authorization)
    else:
        owner = File.fingerprint == args.fingerprint
    files = files.where(owner)
    total = total.where(owner)

    if mtype is not None:
        files = restrict_to_type(files)
        total = restrict_to_type(total)

    if mimetype is not None:
        files = files.where(File.mimetype == mimetype)
        total = total.where(File.mimetype == mimetype)

    # Paging bounds apply to the listing only, never to the total.
    if args.after is not None:
        files = files.where(File.id > args.after)
    elif args.before is not None:
        files = files.where(File.id < args.before)

    return ApiResponse({
        'total': total.scalar(),
        'files': [entry.to_dict() for entry in files],
    })
Exemple #18
0
    def run(self, i=1):
        """Tag every file that has no word counts yet and record the totals.

        Files with neither ``title_word_count`` nor ``word_count`` are
        handled in ``publish_date`` order. A file is skipped when it was
        inserted more than 365 days ago, when its link is not http(s),
        when parsing fails, or when tagging raises ``TaggerError``. For
        the others, keyword rows are created for content and title, and
        size, page count and both word totals are stored on the file.

        Args:
            i: running number of the file being handled; used in log
                output and to trigger the single restart once it reaches 5.

        Returns:
            True once the pass (including a restarted pass) has finished.
        """
        log.info("Starting tag process.")
        files = File.select(title_word_count=None,
                            word_count=None).order_by(File.publish_date)
        rnn = rnntagger.RnnTagger("german")

        log.debug(f"Found {len(files)} files to tag.")
        for file in files:
            log.debug(f"Tagging file {i}")
            file_start = time.time()

            link = file.link

            if file.insert_date < datetime.now() - timedelta(days=365):
                continue

            if not link.startswith(('http://', 'https://')):
                log.info(f"Invalid link given. Skipping. {link}")
                continue

            try:
                p = parser.Parser(link)
                text = p.get_text()
                size = int(p.get_size())
                pages = int(p.get_pages())
            except Exception as e:
                # Best effort per file: log the failure and move on.
                log.info(e)
                continue

            log.debug(f"Size: {size}")

            # roughly 40 seconds
            keywords_content = []
            try:
                log.info(f"Start tagging in {len(text) +1} steps.")
                for t in text:
                    keywords_content += rnn.tag(t)
                    log.debug("Next step.")
                keywords_title = rnn.tag(file.title)
            except rnntagger.TaggerError as e:
                log.info(f"Skipping. Tagging failed with: {e}")
                continue

            log.debug("Sorting keywords")
            keywords_content_sorted = self.sort(keywords_content)
            keywords_title_sorted = self.sort(keywords_title)

            log.debug(
                f"Found {len(keywords_content)} words. Start inserting them.")
            tot_words = 0
            for word, tags in keywords_content_sorted.items():
                for tag, word_count in tags.items():
                    k_id = self._get_keyword_id(word, tag)
                    # Compare by value: `is -1` tests object identity.
                    if k_id == -1:
                        continue
                    FileKeywordContent(file_id=file.id,
                                       keyword_id=k_id,
                                       word_count=word_count)
                    tot_words += word_count

            tot_title = 0
            log.debug(
                f"Found {len(keywords_title)} words. Start inserting them.")
            for word, tags in keywords_title_sorted.items():
                for tag, word_count in tags.items():
                    k_id = self._get_keyword_id(word, tag)
                    if k_id == -1:
                        continue
                    FileKeyword(file_id=file.id,
                                keyword_id=k_id,
                                word_count=word_count)
                    tot_title += word_count

            file.file_size = size
            file.word_count = tot_words
            file.title_word_count = tot_title
            file.pages = pages
            log.debug(f"File DUR: {time.time()-file_start}")
            i += 1

            if i == 5:
                log.debug("Flushing session to avoid data loss.")
                # Hand the remaining work to a fresh pass and stop here:
                # continuing with this already-fetched result set after the
                # nested pass returns would tag the same files a second time.
                return self.run(i)

        log.info("Finished tagging. Nothing left to do.")
        return True
Exemple #19
0
def HAS(_, token):
    """Return the set of files having metadata whose field is *token*.

    The first positional argument is ignored.
    """
    joined = File.select().join(Metadata, peewee.JOIN_LEFT_OUTER)
    return set(joined.where(Metadata.field == token))
    def run(self, files=0):
        """Scrape the page at the current offset, then recurse to the next.

        Scraping stops when ``self.offset`` exceeds 150, when the page
        cannot be fetched, or when an entry is older than the cut-off date
        (one day before the end of the last finished scrape, or 30 days
        ago when no such scrape is available). When it stops for the
        first or last of these reasons, the most recent "started" scrape
        is marked as finished. Entries already in the database (same
        title and number) are skipped.

        Args:
            files: number of files stored so far during this scrape;
                carried through the page-to-page recursion.
        """
        if self.offset > 150:
            with db_session:
                this_scrape = Scrape.select(status="started").order_by(desc(Scrape.start)).first()
                if this_scrape is not None:
                    this_scrape.status = "finished"
                    this_scrape.end = datetime.now()
                    this_scrape.processed_files = files
                log.info("To far back in time. Stopping.")
            return

        try:
            page = self.get_page()
        except resource_error.ResourceError as e:
            log.error(e)
            return

        with db_session:
            # `.first` without the call parentheses yielded a bound method,
            # so the lookup below always failed and the bare `except:`
            # silently forced the 30-day fallback.
            try:
                last_scrape = Scrape.select(status="finished").order_by(desc(Scrape.start)).first()
            except Exception as e:
                # Keep the original best-effort behaviour, but say why.
                log.error(f"Could not load last finished scrape: {e}")
                last_scrape = None

            if last_scrape is not None and last_scrape.end is not None:
                runtil_date = last_scrape.end - timedelta(days=1)
            else:
                runtil_date = datetime.now() - timedelta(days=30)

            if self.offset == 0:
                # First page of a run: record that a new scrape has started.
                Scrape(start=datetime.now(), status="started")

        for entry in page.split('<hr class="col-xs-12 divider divider-small">'):
            log.debug(f"Next file ({files}) ...")
            try:
                with db_session:
                    soup = BeautifulSoup(entry, 'html.parser')
                    title = soup.a.get_text()

                    if len(title) > 160:
                        short_title = title[:157] + "..."
                    else:
                        short_title = title

                    # end of page or invalid title
                    if "Weitere Dokumente anzeigen" in title or len(title) < 5:
                        log.debug("EOF or title invalid. Skipping.")
                        continue

                    try:
                        info = soup.find_all('li')
                        publish_date = datetime.strptime(info[1].getText().replace("Datum: ", "", 1), "%d.%m.%Y")
                        number = info[0].getText()
                        doc_type = info[2].getText().replace("Art: ", "", 1)
                        author = info[3].getText().replace("Urheber: ", "", 1)
                        link = 'https://www.landtag-bw.de'+soup.a.get('href') # TODO move base url to config
                        insert_date = datetime.now()
                    except Exception as e:
                        log.info(f"Could not parse file. Skipping. {e}")
                        continue

                    if publish_date < runtil_date:
                        log.info("Reach end of last scrape, finished.")
                        this_scrape = Scrape.select(status="started").order_by(desc(Scrape.start)).first()
                        if this_scrape is not None:
                            this_scrape.status = "finished"
                            this_scrape.end = datetime.now()
                            this_scrape.processed_files = files
                            this_scrape.flush()
                        return

                    # check if file already exists
                    file_ = File.select(title=title, number=number).first()
                    if file_ is not None:
                        # TODO update file in database
                        log.info(f"Already having file \"{title}\" in database. Skipping.")
                        continue

                    File(title=title, title_short=short_title, number=number, publish_date=publish_date,
                         type=doc_type, author=author, insert_date=insert_date,
                         link=link)
                    files += 1
            except Exception as e:
                log.error(f"Fatal error, file might not be saved: {e}")

        log.debug("EOF. Getting next page.")
        self.offset += 30
        self.run(files=files)
Exemple #21
0
def UNTAGGED():
    """Return the set of files with no metadata other than ``import-time``."""
    is_real_metadata = Metadata.field != 'import-time'
    tagged_files = Metadata.select(Metadata.file).where(is_real_metadata)
    untagged = File.select().where(File.id.not_in(tagged_files))
    return set(untagged)