Exemple #1
0
def identify_date_with_over_allocation(df, Resource, list):
    """Report dates (today .. 2020-12-31) where a resource's allocation is
    more than one standard deviation away from its mean.

    Args:
        df: DataFrame with 'Start_Date', 'Finish_Date' (datetime.date) and
            'Other_Activity_Resource' columns.
        Resource: resource identifier, used in the output CSV file names.
        list: label appended to the output CSV file names.
            NOTE(review): shadows the `list` builtin; name kept for
            backward compatibility with existing callers.

    Side effects: prints min/mean/max/std of the daily activity count and
    writes over-/under-utilization CSVs with sanitized file names.
    """
    daterange = pd.date_range(datetime.today(), datetime.strptime('2020-12-31', "%Y-%m-%d"))
    # Count, for each day in range, how many activities span that day.
    # Collect rows in a plain list and build the DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
    rows = []
    for single_date in daterange:
        holddf = df[(df['Start_Date'] <= single_date.date()) &
                    (df['Finish_Date'] >= single_date.date())]
        rows.append({'Date': single_date.date(), 'Count': len(holddf)})
    Utilizationdf = pd.DataFrame(rows, columns=['Date', 'Count'])
    mean = round(np.mean(Utilizationdf['Count'], 0))
    print(' '.join(['minimum = ', str(round(np.amin(Utilizationdf['Count'], 0)))]))
    print(' '.join(['mean =', str(mean)]))
    print(' '.join(['maximum = ', str(round(np.amax(Utilizationdf['Count'], 0)))]))
    std = round(np.std(Utilizationdf['Count'], 0))
    print(' '.join(['Standard Deviation =', str(std)]))
    # Over/under = more than one standard deviation from the mean.
    Overutilizationdf = Utilizationdf[(Utilizationdf['Count'] > mean + std)]
    underutilizationdf = Utilizationdf[(Utilizationdf['Count'] < mean - std)]

    print('Allocation is higher than mean + 1 standard deviation for ' + df.iloc[
        0].Other_Activity_Resource + ' on the following dates:')
    print(Overutilizationdf)
    Overutilizationdf.to_csv(sanitize('_'.join([str(Resource), str(list), 'Overutilizationdf.csv'])))
    print('Allocation is lower than mean - 1 standard deviation for ' + df.iloc[
        0].Other_Activity_Resource + ' on the following dates:')
    underutilizationdf.to_csv(sanitize('_'.join([str(Resource), str(list), 'underutilizationdf.csv'])))
    print(underutilizationdf)
    def export(retrieved_annotations: dict, directory: str) -> None:
        """Write each author's annotated books as .docx files under *directory*.

        One sub-directory per author, one Word document per book; each
        chapter becomes a titled section listing its annotations (quoted
        text plus the optional comment).
        """
        for author, books in retrieved_annotations.items():
            author_dir = "{}/{}/".format(directory, sanitize(author))
            Path(author_dir).mkdir(parents=True, exist_ok=True)
            for book, chapters in books.items():
                target = "{}/{}.docx".format(author_dir, sanitize(book))
                doc = Document()
                doc.add_heading(book, level=1)
                doc.add_paragraph(author, style='Caption', )
                spacer = doc.add_paragraph("")
                spacer.line_spacing_rule = WD_LINE_SPACING.DOUBLE

                for chapter, annotations in chapters.items():
                    doc.add_paragraph(chapter, style='Title')
                    spacer = doc.add_paragraph("")
                    spacer.line_spacing_rule = WD_LINE_SPACING.DOUBLE
                    for annotation in annotations:
                        note = annotation.comment if annotation.comment else ''
                        quote = doc.add_paragraph(annotation.text, style='Intense Quote')
                        quote.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                        body = doc.add_paragraph(note, style='No Spacing')
                        body.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
                        spacer = doc.add_paragraph("")
                        spacer.line_spacing_rule = WD_LINE_SPACING.DOUBLE

                doc.save(target)
Exemple #3
0
def put_music(music, response, music_folder=None, iter=None):
    """Save a downloaded track to disk and fill in its ID3 tags.

    Args:
        music: dict with at least 'id', 'title', 'artist', 'track_covers'.
        response: HTTP response whose .content is the raw MP3 bytes.
        music_folder: destination root; defaults to './files/music'.
        iter: unused; kept for backward compatibility with existing callers.

    The file is first written under the sanitized title, tagged, then
    renamed to '<artist>-<album>-<title>.mp3'.
    """
    if music_folder is None:
        path = './files/music/{0}'.format(music['id'])
    else:
        path = '{0}/{1}'.format(music_folder, music['id'])
    mkdir_if_not_exists(path)

    file_name = sanitize(music['title'].replace('/', '|'))
    file_path = '{0}/{1}.mp3'.format(path, file_name)
    # Context manager guarantees the handle is closed even if write() fails.
    with open(file_path, 'wb') as out_file:
        out_file.write(response.content)

    print(file_path)
    print(music['title'])

    tag = id3.Tag()
    tag.parse(file_path)
    try:
        tag.version = id3.ID3_DEFAULT_VERSION
    except Exception:
        # Narrowed from a bare except; fall back to the legacy v1 tag.
        tag.version = id3.ID3_V1

    if tag.title is None:
        tag.title = music['title']
    if tag.artist is None:
        tag.artist = music['artist']
    if len(music['track_covers']):
        # Drop existing embedded images, then embed the last track cover.
        # NOTE(review): assumes the last cover URL is the preferred one.
        for image in [y.description for y in tag.images]:
            tag.images.remove(image)
        img_url = music['track_covers'][-1]
        r = requests.get(img_url)
        tag.images.set(3, r.content, 'image/jpeg')

    track_name = '{0}-{1}-{2}.mp3'.format(
        tag.artist, tag.album if tag.album is not None else '', tag.title)
    tag.title = track_name

    try:
        tag.save()
    except Exception:
        # Narrowed from a bare except; retry with the legacy v1 tag.
        tag.version = id3.ID3_V1
        tag.save()

    track_name = track_name.replace('--', '-')
    track_name = track_name.replace('--', '-')
    track_name = sanitize(track_name)

    new_track_path = '{0}/{1}'.format(path, track_name)
    os.rename(file_path, new_track_path)
Exemple #4
0
def main():
    """Handle the image-to-PDF endpoint.

    GET  -> render the upload form.
    POST -> validate the uploaded images, compress them into one PDF and
            return it as a download (JSON metadata if sending fails).
    """
    print(request.method)
    if request.method == 'GET':
        return render_template('index.html')
    elif request.method == 'POST':
        # extract file data from the request
        files = request.files.getlist("file[]")
        if not files:
            abort(400, 'Provide at least one image')

        # One uuid-derived name is shared by the work folder and the PDF.
        pdf_name = str(uuid.uuid4())[:18]
        folder_to_save = os.path.join(app.config['UPLOAD_FOLDER'], pdf_name)

        # file sanitization check: every upload must carry an image extension
        allowed_exts = ('PNG', 'JPG', 'JPEG', 'JIFF', 'TIFF')
        for file in files:
            name_parts = sanitize(file.filename).rsplit('.', 1)
            # A name without '.' has no extension at all (the original
            # raised IndexError here instead of a clean 400).
            if len(name_parts) < 2 or name_parts[1].upper() not in allowed_exts:
                abort(400, 'Wrong file type')

        # creating the folder to save the files
        if not os.path.exists(folder_to_save):
            os.mkdir(folder_to_save)

        # saving the images
        for image in files:
            image.save(os.path.join(folder_to_save, sanitize(image.filename)))

        # build the compressed pdf from folder_to_save into UPLOAD_FOLDER
        pdf_size = pdf.create_compressed_pdf(folder_to_save, pdf_name)

        # the source images are no longer needed once the pdf exists
        if os.path.exists(folder_to_save):
            shutil.rmtree(folder_to_save)

        file_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                 pdf_name + '.pdf')
        try:
            return send_file(file_path,
                             mimetype='application/pdf',
                             attachment_filename="Your_small_pdf.pdf",
                             as_attachment=True)
        except Exception:
            # Narrowed from a bare except; fall back to returning metadata.
            return jsonify(pdf_name, pdf_size)
Exemple #5
0
    def validate_name(self, name):
        """Return a sanitized, collision-free name within the parent folder.

        The parent folder and HTTP method come from the serializer context.
        On PUT/PATCH the instance being updated is excluded from the clash
        check, so renaming an item to its own name is allowed. On collision
        a counter is inserted before the extension:
        'base (2).ext', 'base (3).ext', ...
        """
        parent = self.context['data']['folder']
        method = self.context['request'].method

        if method in ('PUT', 'PATCH'):

            # Update path: ignore the current instance when testing clashes.
            def validator(name):
                return (Folder.objects.filter(
                    folder=parent,
                    name=name).exclude(pk=self.instance.id).exists()
                        or File.objects.filter(
                            folder=parent,
                            name=name).exclude(pk=self.instance.id).exists())

        else:

            # Create path: any sibling Folder/File with the name is a clash.
            def validator(name):
                return (
                    Folder.objects.filter(folder=parent, name=name).exists()
                    or File.objects.filter(folder=parent, name=name).exists())

        if validator(name):
            i = 2
            base, ext = os.path.splitext(name)
            new_name = f'{base} ({i}){ext}'
            while validator(new_name):
                i += 1
                new_name = f'{base} ({i}){ext}'
            name = new_name

        return sanitize(name)
    def parse(self, response, **kwargs):
        """Crawl callback: store HTML articles as JSON (markdown content)
        and PDFs as files under self.documentsDir, then follow in-page
        links found under <main>.
        """
        pagina = {"title": None, "url": response.url, "content": None}

        content_type = response.headers["Content-Type"].decode("utf-8").lower()
        if "html" in content_type:
            page = response.url.split("/")[-2]
            title = response.xpath('//main/section/article/h1/text()').get()
            pagina["title"] = title
            print(title)
            self.log(f'Visited page {title}, {page}')

            # Keep only articles that contain paragraphs (likely useful text).
            articleContent = response.xpath(
                "//div[contains(@class, 'article-content') and p]").get()
            if articleContent:
                filename = sanitize(title).lower().replace(' ', '-')
                filename = self.documentsDir + filename + ".json"
                try:
                    # 'x' mode fails if the page was already scraped.
                    with open(filename, 'x') as file:
                        # Convert the article HTML to text (markdown).
                        contentToText = self.text_maker.handle(articleContent)
                        pagina["content"] = contentToText
                        json.dump(pagina, file)
                        # Log the actual filename (the original f-strings
                        # had no placeholder and logged a literal).
                        self.log(f'Saved file {filename}')
                except FileExistsError:
                    self.log(f'file {filename} already exists, skipping')

            for href in response.xpath(
                    "//main//a[not(contains(@class, 'bread-parent'))]/@href"
            ).getall():
                # Skip self-links and in-page anchors.
                if href != '/' and href != response.url and href[0] != '#':
                    yield scrapy.Request(response.urljoin(href), self.parse)

        elif content_type == "application/pdf":
            page = response.url.split("/")[-1]
            filename = sanitize(page).lower().replace(' ', '-')
            filename = self.documentsDir + filename
            if ".pdf" not in filename:
                filename += ".pdf"
            try:
                with open(filename, 'xb') as file:
                    # Write the downloaded pdf verbatim.
                    file.write(response.body)
                    self.log(f'Saved file {filename}')
            except FileExistsError:
                self.log(f'file {filename} already exists, skipping')
Exemple #7
0
def main():
    """Main entry point: load the project schedule workbook and, for each
    district, export activities missing resources, print district stats,
    and plot/analyze per-resource utilization through the season."""
    Project_Data_Filename = 'Metro West PETE Schedules.xlsx'

    logger.info("Starting Resource Tracker")
    Change_Working_Path('./Data')
    try:
        Project_Data_df = Excel_to_Pandas(sanitize(Project_Data_Filename))
    except Exception:
        # Narrowed from a bare except; the error is still propagated.
        logger.error('Can not find Project Data file')
        raise

    Project_Data_df['Start_Date'] = Project_Data_df['Start_Date'].dt.date
    Project_Data_df['Finish_Date'] = Project_Data_df['Finish_Date'].dt.date

    Project_Data_df = Project_Data_df.sort_values(by=['Start_Date'])

    # Stats about Other Activity Resource

    districts = ['AMARILLO', 'BIG SPRING', 'FORT WORTH', 'GRAHAM', 'ODESSA', 'SWEETWATER', 'WICHITA FALLS']
    # scenarios = ['Without Assumptions', 'With Assumptions']
    scenarios = ['']  # renamed from `list`, which shadowed the builtin
    for item in scenarios:
        for district in districts:

            if item == 'With Assumptions':
                # Assumption: unassigned planning/construction work in this
                # district is handled by the Ft. Worth P&C crews.
                Project_Data_df.loc[((Project_Data_df['Grandchild'] == 'Electrical Job Planning') &
                                     (pd.isnull(Project_Data_df['Other_Activity_Resource'])) &
                                     (Project_Data_df['Work_Center_Name'] == district)), [
                                        'Other_Activity_Resource']] = 'Ft. Worth P&C Crews'

                Project_Data_df.loc[(Project_Data_df['Grandchild'] == 'Electrical Construction') &
                                    (pd.isnull(Project_Data_df['Other_Activity_Resource'])) &
                                    (Project_Data_df['Work_Center_Name'] == district), [
                                        'Other_Activity_Resource']] = 'Ft. Worth P&C Crews'

            planneddf, resourcemissingdf = filter_Prject_Data_By_Schedule(district, Project_Data_df)

            resourcemissingdf.to_excel(' '.join([district, 'Activities missing resources.xlsx']), district, index=False,
                                       engine='xlsxwriter')

            print_district_stats(planneddf, resourcemissingdf, district)

            for Resource in planneddf.Other_Activity_Resource.dropna().unique():
                print(Resource)
                # Activities for this resource finishing between now and the
                # end of the current construction season.
                DATADF = planneddf[(planneddf['Other_Activity_Resource'] == Resource) &
                                   (planneddf['Finish_Date'] > pd.Timestamp(datetime.now())) &
                                   (planneddf['Finish_Date'] <= pd.Timestamp(
                                       datetime.strptime(str(find_the_counstrunction_Season(datetime.today())),
                                                         "%Y-%m-%d")))]

                if DATADF.size > 0:
                    make_gnat(DATADF, ' '.join([DATADF['Other_Activity_Resource'].values[0], 'Utilization', item]))
                    identify_date_with_over_allocation(DATADF, Resource, item)
Exemple #8
0
 def export(self, retrieved_annotations: dict, directory: str) -> None:
     """Append each author's annotations to UTF-16 .txt files under
     *directory*.

     One sub-directory per author, one text file per book; each chapter
     heading is followed by '[last_update] - (comment): text' entries.
     """
     for author in retrieved_annotations:
         author_directory = "{}/{}/".format(directory, sanitize(author))
         Path(author_directory).mkdir(parents=True, exist_ok=True)
         books = retrieved_annotations[author]
         for book in books:
             book_file_name = "{}/{}.txt".format(author_directory,
                                                 sanitize(book))
             # 'with' guarantees the handle is closed; the original opened
             # the file and never closed it (resource leak).
             with Path(book_file_name).open(mode="a",
                                            encoding="utf-16") as book_file:
                 chapters = books[book]
                 for chapter in chapters:
                     book_file.write("\n\n{}\n".format(chapter))
                     annotations = chapters[chapter]
                     for annotation in annotations:
                         comment = ''
                         if annotation.comment is not None and annotation.comment:
                             comment = "({})".format(annotation.comment)
                         last_update = "[{}]".format(annotation.last_update)
                         book_file.write("{} - {}: {}\n\n".format(
                             last_update, comment, annotation.text))
                     book_file.write("\n")
Exemple #9
0
def process_caption(caption,
                    lecture_index,
                    lecture_title,
                    lecture_dir,
                    tries=0):
    """Download one lecture caption file, retrying up to 3 times, then
    convert vtt captions to srt.

    Args:
        caption: dict with 'url', 'ext' and 'locale_id' keys.
        lecture_index, lecture_title: used to build the target file name.
        lecture_dir: directory the caption is saved into.
        tries: internal retry counter; callers leave it at 0.
    """
    # Plain %-formatting: the f-prefix on these strings was a no-op
    # (no {} placeholders), so it has been dropped.
    filename = "%s. %s_%s.%s" % (lecture_index, sanitize(lecture_title),
                                 caption.get("locale_id"), caption.get("ext"))
    filename_no_ext = "%s. %s_%s" % (lecture_index, sanitize(lecture_title),
                                     caption.get("locale_id"))
    # NOTE(review): backslash path join is Windows-only; kept as-is.
    filepath = "%s\\%s" % (lecture_dir, filename)

    if os.path.isfile(filepath):
        print("> Captions '%s' already downloaded." % filename)
    else:
        print("> Downloading captions: '%s'" % filename)
        try:
            download(caption.get("url"), filepath, filename)
        except Exception as e:
            if tries >= 3:
                print(
                    f"> Error downloading captions: {e}. Exceeded retries, skipping."
                )
                return
            else:
                print(
                    f"> Error downloading captions: {e}. Will retry {3-tries} more times."
                )
                process_caption(caption, lecture_index, lecture_title,
                                lecture_dir, tries + 1)
                # The retry call handles conversion itself; don't fall
                # through and convert again in this (failed) frame.
                return
        if caption.get("ext") == "vtt":
            try:
                print("> Converting captions to SRT format...")
                convert(lecture_dir, filename_no_ext)
                print("> Caption conversion complete.")
                os.remove(filepath)
            except Exception as e:
                print(f"> Error converting captions: {e}")
Exemple #10
0
def parse(data):
    """Walk the course curriculum objects, create chapter directories and
    download every video lecture.

    Uses the module-level `download_dir` and `course_id`. Lectures that
    appear before any chapter are saved directly under the course dir.
    """
    course_dir = "%s\\%s" % (download_dir, course_id)  # Windows-style path
    if not os.path.exists(course_dir):
        os.mkdir(course_dir)
    chapters = []
    lectures = []

    for obj in data:
        if obj["_class"] == "chapter":
            obj["lectures"] = []
            chapters.append(obj)
        elif obj["_class"] == "lecture" and obj["asset"][
                "asset_type"] == "Video":
            try:
                chapters[-1]["lectures"].append(obj)
            except IndexError:
                # No starting chapter yet: treat as a chapterless lecture.
                lectures.append(obj)
                # len() instead of .index(obj): correct even when two
                # lecture dicts compare equal, and O(1) instead of O(n).
                lecture_index = len(lectures)
                lecture_path = "%s\\%s. %s.mp4" % (course_dir, lecture_index,
                                                   sanitize(obj["title"]))
                process_lecture(obj, lecture_index, lecture_path, download_dir)

    for chapter_no, chapter in enumerate(chapters, start=1):
        chapter_dir = "%s\\%s. %s" % (course_dir, chapter_no,
                                      sanitize(chapter["title"]))
        if not os.path.exists(chapter_dir):
            os.mkdir(chapter_dir)

        for lecture_index, lecture in enumerate(chapter["lectures"], start=1):
            lecture_path = "%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
                                               sanitize(lecture["title"]))
            process_lecture(lecture, lecture_index, lecture_path, chapter_dir)
    print("\n\n\n\n\n\n\n\n=====================")
    print("All downloads completed for course!")
    print("=====================")
    def validate_name(self, name):
        """Return *name* sanitized and de-duplicated against siblings in
        the parent folder by appending ' (2)', ' (3)', ... until unique."""
        parent = self.context['data']['folder']

        def taken(candidate):
            # A clash is any sibling Folder or File with this exact name.
            return (Folder.objects.filter(folder=parent,
                                          name=candidate).exists()
                    or File.objects.filter(folder=parent,
                                           name=candidate).exists())

        if taken(name):
            counter = 2
            candidate = f'{name} ({counter})'
            while taken(candidate):
                counter += 1
                candidate = f'{name} ({counter})'
            name = candidate

        return sanitize(name)
Exemple #12
0
def make_gnat(df, title):
    """Draw a Gantt-style chart of the activities in *df* and save it as
    'Output/<sanitized title>.png'.

    Each bar runs Start_Date..Finish_Date; bars whose
    'Start_Date_Planned\\Actual' flag is 'A' (actual) are maroon,
    planned ones red.
    """
    labels = df.PETE_ID.apply(str) + ' - ' + df.Grandchild.apply(str)
    length = len(df.index)
    # Evenly spaced y positions: 100/length, 200/length, ..., 100.
    ticks = [(x + 1) / length * 100 for x in range(length)]

    # Declaring a figure "gnt"
    fig, gnt = plt.subplots(figsize=(19, 15))

    fig.suptitle(title, fontsize=16)

    # Y-axis spans all bars plus half a slot of padding on each side.
    gnt.set_ylim(0, ticks[-1] + ticks[0])

    # X-axis runs from today to the end of 2020.
    gnt.set_xlim(date.today(), datetime.strptime('2020-12-31', "%Y-%m-%d"))

    # Setting ticks on y-axis and labelling them with the activities.
    gnt.set_yticks(ticks)
    gnt.set_yticklabels(labels)

    # Setting graph attribute
    gnt.grid(True)
    gnt.xaxis_date()

    # One horizontal bar per activity. The two original branches differed
    # only in color, so the color is now selected and a single barh called.
    for x in range(length):
        color = 'maroon' if df['Start_Date_Planned\Actual'].values[x] == 'A' else 'red'
        gnt.barh(ticks[x], (df.Finish_Date.values[x] - df.Start_Date.values[x]),
                 left=df.Start_Date.values[x], height=ticks[0] / 2,
                 align='center', color=color, alpha=0.8)

    fig.autofmt_xdate()
    plt.tight_layout()
    plt.savefig('Output/' + sanitize(title) + '.png')
    async def download_and_store(
            self,
            url_object: list,
            session: aiohttp.ClientSession,
            headers: Optional[CaseInsensitiveDict] = None,
            show_progress: bool = True
    ) -> None:
        """Download the content of given URL and store it in a file.

        Skips the download entirely when the target file already exists
        under self.folder / self.title.
        """
        url, referal = url_object[0], url_object[1]

        filename = sanitize(url.split("/")[-1])
        target = self.folder / self.title / filename
        if target.exists():
            logger.debug(str(target) + " Already Exists")
            return
        logger.debug("Working on " + url)
        await self.download_file(url, referal=referal, filename=filename,
                                 session=session, headers=headers,
                                 show_progress=show_progress)
        await self.rename_file(filename)
Exemple #14
0
def print_jsoncsl(refs):
    """Print the given references as a CSL-JSON array on stdout."""
    def to_csl(ref):
        # Map one internal record onto the CSL-JSON vocabulary.
        return {
            'id': sanitize(ref['objet'] + '-' + ref['ID']),
            'type': ('article-newspaper' if ref['type'] == 'Tribune'
                     else 'personal_communication'),
            'letterType': ref['type'],
            'abstract': ref['catchphrase'],
            'container-title': ref['publication'],
            'title': ref['titre'],
            'URL': ref['url'],
            'author': [{'literal': ref['auteurs']}],
            'recipient': [{'literal': ref['destinataire']}],
            'issued': {'raw': ref['date']},
            'note': ref['position'],
        }

    print(json.dumps([to_csl(ref) for ref in refs], indent=4, ensure_ascii=False))
Exemple #15
0
def fingerprinter_upload(request):
    """Accept a PDF upload, save it under /tmp/<random>/ and queue the
    fingerprinting task.

    Returns a JsonResponse with the work directory, cleaned filename and
    task id. Raises 406 for non-PDF uploads and 404 when no file is sent.
    """
    pdf_file = request.FILES.get('pdf-file')
    copy_count = request.POST.get('copy-count', 1)
    suffix = request.POST.get('file-suffix', '')

    try:
        copy_count = int(copy_count)
    except (TypeError, ValueError):
        # Narrowed from a bare except; bad form input means one copy.
        copy_count = 1

    if pdf_file is not None:

        base, extension = os.path.splitext(pdf_file.name)
        filename = base.replace("'", '').replace('"', '')

        if extension.lower() != '.pdf':
            raise HTTPExceptions.NOT_ACCEPTABLE #Error code 406

        #make save directory
        rand_path = randword(9)
        save_path = os.path.join('/tmp/', rand_path)
        os.makedirs(save_path)

        filename = sanitize(filename)

        filename = filename.replace("'", '').replace('"', '')
        # Collapse parentheses/whitespace runs into single dashes.
        filename = re.sub(r"[\(,\),\s]+", "-", filename)

        save_temp_file(filename, pdf_file, subdir=rand_path)

        #trigger fingerprint task
        task_id = refingerprint_pdf.delay(filename, rand_path, copy_count, suffix)

        data = {'directory': rand_path, 'filename': filename, 'task_id': str(task_id)}

        return JsonResponse(data)

    else:
        raise Http404('file not provided')
Exemple #16
0
    def run(self):
        """Back up all Trello boards (personal and organization) as JSON
        files in a timestamped backups/ folder, optionally including
        attachments.

        Raises ConfigError when the API key is missing and APIError when
        any Trello API call returns an empty/invalid body.
        """
        if len(self.apiKey) < 32:
            raise ConfigError('API key not set.')
        if len(self.appToken) < 32:
            # https://trello.com/app-key
            # Request a read-only application token which never expires.
            url = 'https://trello.com/1/authorize?key={}&name=My+Backup+App&expiration=never&response_type=token&scope=read'.format(
                self.apiKey)
            print(
                'Application token not set. Please visit {} in your browser to create an application token.'
                .format(url))
            return

        # Use backup time as folder name.
        self.backupFolder = 'backups/{}'.format(
            sanitize(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

        # Fetch all boards.
        response = requests.get(
            "https://api.trello.com/1/members/me/boards?key={}&token={}".
            format(self.apiKey, self.appToken))
        boards = json.loads(response.text)
        if not boards:
            raise APIError('Error fetching boards. ' + response.text)

        # Fetch boards in organizations.
        response = requests.get(
            "https://api.trello.com/1/members/me/organizations?key={}&token={}"
            .format(self.apiKey, self.appToken))
        orgs = json.loads(response.text)
        orgsDict = {org['id']: org for org in orgs}
        for org in orgs:
            response = requests.get(
                "https://api.trello.com/1/organizations/{}/boards?&key={}&token={}"
                .format(org['id'], self.apiKey, self.appToken))
            orgBoards = json.loads(response.text)
            if not orgBoards:
                raise APIError('Error fetching organization boards. ' +
                               response.text)
            boards.extend(orgBoards)

        # Load content for each board and save it to file.
        for board in boards:
            # `is not None` instead of `!= None`; the original also re-tested
            # the id inside the ternary, which was redundant.
            org_id = board['idOrganization']
            if org_id is not None:
                orgName = orgsDict[org_id]['displayName'] if org_id in orgsDict else org_id
            else:
                # NOTE(review): misspelled output string kept byte-identical
                # to preserve existing file names / logs.
                orgName = "UNKNONWN"
            print('Fetching board {} in organization {}'.format(
                board['name'], orgName))
            fetchURL = 'https://api.trello.com/1/boards/{}?actions=all&actions_limit=1000&card_attachment_fields=all&cards=all&lists=all&members=all&member_fields=all&card_attachment_fields=all&checklists=all&fields=all&key={}&token={}'.format(
                board['id'], self.apiKey, self.appToken)
            response = requests.get(fetchURL)
            jsonObj = json.loads(response.text)
            if not jsonObj:
                raise APIError('Error fetching the content of board "{}". '.
                               format(board['name']) + response.text)
            fileName = sanitize('org-{}-board-{}.json'.format(
                orgName, board['name']))
            self.save(fileName, response.text)

            if self.backupAttachments:
                for action in jsonObj['actions']:
                    # There is attachment data and the attachment has url.
                    if 'attachment' in action['data'] and 'url' in action[
                            'data']['attachment']:
                        attachment = action['data']['attachment']
                        print('>>>>Fetching attachment {}: {}'.format(
                            attachment['id'], attachment['name']))
                        response = requests.get(attachment['url'])
                        fileName = sanitize('attachment-{}-{}'.format(
                            attachment['id'], attachment['name']))
                        self.save(fileName, response.text)

        print(
            'Done! {} trello boards have been downloaded and saved in "{}" folder.'
            .format(len(boards), self.backupFolder))
Exemple #17
0
    async def convert_to_pdf(self, md5: str, msg: Message):
        """Convert the book identified by *md5* to PDF and upload it.

        If a PDF for this md5 is already stored, the cached Telegram
        message is copied to the chat instead of converting again. The
        conversion runs in an executor while this coroutine keeps editing
        the ack message with the elapsed time.
        """
        ack_msg = await msg.reply_text('About to convert book to PDF...',
                                       quote=True)
        book = await BookdlFiles().get_file_by_md5(md5=md5,
                                                   typ='application/pdf')
        if book:
            # Cached copy exists: resend the stored message and stop.
            await BookDLBot.copy_message(chat_id=msg.chat.id,
                                         from_chat_id=book['chat_id'],
                                         message_id=book['msg_id'])
            await ack_msg.delete()
            return
        _, detail = await Util().get_detail(
            md5, return_fields=['mirrors', 'title', 'extension', 'coverurl'])

        # Per-request scratch directory keyed by chat id + message id.
        temp_dir = Path.joinpath(Common().working_dir,
                                 Path(f'{ack_msg.chat.id}+{ack_msg.id}'))
        if not Path.is_dir(temp_dir):
            Path.mkdir(temp_dir)

        direct_links = await LibgenDownload().get_directlink(
            detail['mirrors']['main'])
        extension = detail['extension']
        # ConvertAPI request parameters; 'File' is the direct download URL.
        params = {
            'File': direct_links[1],
            'PdfVersion': '2.0',
            'OpenZoom': '100',
            'PdfTitle': '@SamfunBookdlbot - ' + detail['title'],
            'RotatePage': 'ByPage'
        }
        # Shared flag the executor thread sets when conversion finishes.
        stat_var = f"{ack_msg.chat.id}{ack_msg.id}"
        convert_status[stat_var] = {'Done': False}
        try:
            loop = asyncio.get_event_loop()
            convert_process = loop.run_in_executor(None, self.__convert,
                                                   params, extension, stat_var)
            start_time = time.time()
            # Poll every 2s, updating the ack message with elapsed seconds.
            while True:
                if convert_status[stat_var]['Done']:
                    break
                else:
                    try:
                        await ack_msg.edit_text(
                            f'Convertion to PDF started... {int(time.time() - start_time)}'
                        )
                    except MessageNotModified as e:
                        logger.error(e)
                    except FloodWait as e:
                        # Telegram rate limit: wait the time it demands.
                        logger.error(e)
                        await asyncio.sleep(e.x)
                    await asyncio.sleep(2)
            Result = await convert_process
        except ApiError as e:
            logger.error(e)
            await ack_msg.edit_text(e)
            shutil.rmtree(temp_dir)
            return

        file_path = Path.joinpath(
            temp_dir,
            Path('[@SamfunBookdlbot] ' + sanitize(detail['title']) + '.pdf'))
        detail[
            'cost'] = f'ConvertAPI Cost: **{Result.conversion_cost}** seconds.'
        await ack_msg.edit_text(f'About to download converted file...')
        try:
            # Stream the converted file from ConvertAPI to disk in 1 MiB
            # chunks, editing the progress message at most every 2 seconds.
            async with aiohttp.ClientSession() as dl_ses:
                async with dl_ses.get(Result.file.url) as resp:
                    total_size = int(Result.file.size)
                    file_name = Result.file.filename

                    async with aiofiles.open(file_path, mode="wb") as dl_file:
                        current = 0
                        logger.info(f'Starting download: {file_name}')
                        start_time = time.time()
                        async for chunk in resp.content.iter_chunked(1024 *
                                                                     1024):
                            await dl_file.write(chunk)
                            current += len(chunk)
                            if time.time() - start_time > 2:
                                await ack_msg.edit_text(
                                    f'Downloading: **{detail["title"]}**\n'
                                    f"Status: **{size.format_size(current, binary=True)}** of **{size.format_size(total_size, binary=True)}**"
                                )
                                start_time = time.time()
        except Exception as e:
            logger.exception(e)
            return None
        await Uploader().upload_book(file_path, ack_msg, md5, detail=detail)
from os import listdir, rename
from os.path import isfile, join

from sanitize_filename import sanitize

if __name__ == "__main__":
    target_dir = "./"

    # Rename every regular file in target_dir to its sanitized form.
    only_files = [f for f in listdir(target_dir) if isfile(join(target_dir, f))]
    for name in only_files:
        sanitized = sanitize(name)
        if sanitized != name:
            # join() is robust to target_dir lacking a trailing slash,
            # unlike the original `target_dir + file` concatenation
            # (which also shadowed the `file` builtin name).
            rename(join(target_dir, name), join(target_dir, sanitized))
Exemple #19
0
def upload(request):
    """Handle a POST of a PDF destined for OCR processing.

    Validates and sanitizes the uploaded file name, saves the file to a
    temp location, uploads new files to S3 and records them as OCRUpload
    rows; duplicate files (matched by md5) reuse the existing name.

    Returns:
        JsonResponse describing the stored file, or HttpResponseNotAllowed
        for non-POST requests.

    Raises:
        HTTPExceptions.NOT_ACCEPTABLE: no file was supplied (406).
        SuspiciousFileOperation: the file name or type is unacceptable.
    """
    if request.method == 'POST':

        file_ = request.FILES.get('pdf-file')

        processing_error = None

        if file_ is None:
            raise HTTPExceptions.NOT_ACCEPTABLE  #Error code 406

        filename = file_.name

        if not filename or len(filename) < 3 or not '.' in filename:
            raise SuspiciousFileOperation('improper file name')

        filename = sanitize(filename)

        # Strip quotes and collapse parentheses/whitespace into dashes.
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[\(,\),\s]+", "-", filename)

        temp = filename.split('.')
        basename = '.'.join(temp[:-1])
        extension = temp[-1]

        if not extension in ('pdf', 'PDF'):
            raise SuspiciousFileOperation('improper file type')

        basename = basename[:60]

        # Random suffix avoids collisions between same-named uploads.
        new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)

        #save to /tmp
        md5_hash, tempfile_path = save_temp_file(new_filename, file_)

        #file already exists in system?
        existing_name = check_ocr_file_exists(md5_hash)

        #already_has_text?
        if check_pdf_has_text(new_filename):
            processing_error = 'This PDF already has text. Use the "Force OCR" button to overwrite text with a fresh OCR if desired. If file was OCRd on previous upload those results will be provided'

        if not existing_name:
            already_exists = False

            #upload original to S3
            s3 = S3(settings.AWS_OCR_BUCKET)

            # BUGFIX: context manager closes the temp file handle
            # (the original leaked the open file object).
            with open(tempfile_path, 'rb') as saved_file:
                s3.save_to_bucket(new_filename, saved_file)

            ref = OCRUpload(filename=new_filename,
                            md5_hash=md5_hash,
                            is_original=True)

            ref.save()

            cleanup_temp_file(new_filename)

        else:
            already_exists = True

            # BUGFIX: clean up the temp file under the name it was saved
            # with *before* adopting the existing name — the original
            # reassigned first, so the real temp file was never removed.
            cleanup_temp_file(new_filename)

            new_filename = existing_name

        data = {
            'file_info': {
                'filename': filename,
                'size': file_.size,
                'new_filename': new_filename,
                'processing_error': processing_error,
                'tempfile_path': tempfile_path,
                'already_exists': already_exists,
                'md5_hash': md5_hash
            }
        }

        return JsonResponse(data)

    # BUGFIX: the permitted-methods list contained 'POST,' (stray comma
    # inside the string), producing a malformed Allow header.
    return HttpResponseNotAllowed(['POST'])
Exemple #20
0
 def get_pod_file_name(self, pod):
     """Build a sanitized, timestamped file name for a podcast episode.

     The name is '<UTC timestamp>_<config section>_<title>.<ext>', with
     any URL query string stripped from the extension.
     """
     published = self.get_utc_date(pod.published)
     extension = self.get_pod_file_extension(pod)
     # The extension may carry a trailing '?query' from the source URL.
     if "?" in extension:
         extension = extension.rpartition("?")[0]
     stamp = published.strftime("%Y-%m-%dT%H-%M-%SZ")
     raw_name = "_".join([stamp, self.configSection, pod.title])
     return sanitize(raw_name + "." + extension)
Exemple #21
0
def filename_for(name):
    """Return a filesystem-safe '<name>.json' file name.

    Bad characters are first defanged via the module-level translation
    table, then the whole name is run through sanitize().
    """
    defanged = name.translate(defang_bad_chars)
    return sanitize(f"{defanged}.json")
Exemple #22
0
 def normalize(self):
     """Normalize derived attributes: extension, week, and target path."""
     # Keep only the text after the last dot of the URL as the extension.
     suffix = self.url.rsplit('.', 1)[1]
     self.ext = '.' + suffix
     self.week = sanitize(self.week)
     safe_file = sanitize(self.name) + sanitize(self.ext)
     self.path = '/'.join([DOWNLOADS_DIR, self.course, self.week, safe_file])
Exemple #23
0
 def persist(setName: str, cards: List[Dict[str, Any]]) -> None:
   """Serialize *cards* as JSON into the cards directory, named after
   a sanitized version of *setName*."""
   safe_name = sanitize(setName)
   target = SetUtil.CARDS_DIR + os.sep + safe_name
   with open(target, 'w', encoding='utf-8') as out:
     out.write(json.dumps(cards))
Exemple #24
0
def ref2filename(ref):
    """Build a sanitized file name for a reference record.

    Shape: '<date>_<slug(title)>_<slug(authors)>.<ext of first upload>'.
    """
    stem = '_'.join([ref['date'], slugify(ref['titre']), slugify(ref['auteurs'])])
    extension = ref['upload'][0]['ext']
    return sanitize(stem + '.' + extension)
Exemple #25
0
def upload(request):
    """Handle a POST upload, converting and storing the file as needed.

    Office documents are converted to PDF and spreadsheets to CSV via
    soffice; PDFs must already contain a text layer. New files are
    uploaded to S3 and recorded as FileUpload rows; duplicates (matched
    by md5) return the existing stored name.

    Returns:
        HttpResponse containing the stored file name, or
        HttpResponseNotAllowed for non-POST requests.

    Raises:
        SuspiciousFileOperation: the file name is unacceptable.
        HTTPExceptions.UNPROCESSABLE_ENTITY: conversion failed.
        HTTPExceptions.NOT_ACCEPTABLE: the PDF has no text layer (406).
    """
    filename = ""
    if request.method == 'POST':
        file_ = request.FILES['file']

        filename = file_.name

        if not filename or len(filename) < 3 or not '.' in filename:
            raise SuspiciousFileOperation('improper file name')

        filename = sanitize(filename)

        # Strip quotes and collapse parentheses/whitespace into dashes.
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[\(,\),\s]+", "-", filename)

        temp = filename.split('.')
        basename = '.'.join(temp[:-1])
        extension = temp[-1]

        basename = basename[:60]

        # Random suffix avoids collisions between same-named uploads.
        new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)

        #save file to disk temporarily.
        #later it will be deleted after uploading to s3.
        md5_hash, tempfile_path = save_temp_file(new_filename, file_)

        extension = extension.lower()

        #if file (or processed child) exists, return the name
        existing_name = check_file_exists(md5_hash)

        if existing_name:
            cleanup_temp_file(new_filename)

            return HttpResponse(existing_name)

        #transform process if needed
        process_to_file_type = False

        if extension in ['doc', 'docx', 'odt', 'ott', 'rtf', 'odp', 'ppt', 'pptx']:
            process_to_file_type = 'pdf'

        if extension in ['xls', 'xlsx', 'ods']:
            process_to_file_type = 'csv'

        if process_to_file_type:
            child_name = _soffice_process(
                    tempfile_path, new_filename, md5_hash, process_to_file_type)

            if child_name:
                cleanup_temp_file(child_name)

                return HttpResponse(child_name)

            else:
                # BUGFIX: on failed conversion child_name is falsy, so the
                # original cleanup_temp_file(child_name) removed nothing;
                # clean up the temp file we actually saved.
                cleanup_temp_file(new_filename)
                raise HTTPExceptions.UNPROCESSABLE_ENTITY

        if extension == 'pdf':
            #check if is an image pdf or if it has text
            if not check_pdf_has_text(new_filename):
                cleanup_temp_file(new_filename)
                raise HTTPExceptions.NOT_ACCEPTABLE #Error code 406

        #upload to cloud
        s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)

        # BUGFIX: context manager closes the temp file handle
        # (the original leaked the open file object).
        with open(tempfile_path, 'rb') as saved_file:
            s3.save_to_bucket(new_filename, saved_file)

        #save ref to db
        ref = FileUpload(filename=new_filename, md5_hash=md5_hash,
                extension=extension, is_original=True)

        ref.save()

        cleanup_temp_file(new_filename)

        return HttpResponse(new_filename)

    # BUGFIX: the permitted-methods list contained 'POST,' (stray comma
    # inside the string), producing a malformed Allow header.
    return HttpResponseNotAllowed(['POST'])
def cleanForFileName(string):
    """Return *string* made safe for use as a file name."""
    cleaned = sanitize_filename.sanitize(string)
    return cleaned
Exemple #27
0
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
    """Download one lecture's video plus its supplementary assets and captions.

    Depends on module-level configuration and helpers defined elsewhere in
    this file: skip_lectures, quality, dl_assets, dl_captions,
    caption_locale, working_dir, download(), manifest_parser(),
    handle_irregular_segments(), cleanup() and process_caption().

    Args:
        lecture: lecture dict with at least "title", "asset" and
            "supplementary_assets" keys (Udemy API shape — confirm upstream).
        lecture_index: ordinal used to name the shared external-links file.
        lecture_path: full target path for the downloaded video file.
        lecture_dir: directory that receives assets and captions.
    """
    lecture_title = lecture["title"]
    lecture_asset = lecture["asset"]
    if not skip_lectures:
        # A media license token indicates DRM-protected content.
        if lecture_asset["media_license_token"] == None:
            # not encrypted
            media_sources = lecture_asset["media_sources"]
            if quality:  # if quality is specified, try to find the requested quality
                lecture_url = next(
                    (x["src"]
                     for x in media_sources if x["label"] == str(quality)),
                    media_sources[0]["src"]
                )  # find the quality requested or return the best available
            else:
                lecture_url = media_sources[0][
                    "src"]  # best quality is the first index

            # Skip anything already on disk; download errors are logged,
            # not retried.
            if not os.path.isfile(lecture_path):
                try:
                    download(lecture_url, lecture_path, lecture_title)
                except Exception as e:
                    # We could add a retry here
                    print(f"> Error downloading lecture: {e}. Skipping...")
            else:
                print(f"> Lecture '%s' is already downloaded, skipping..." %
                      lecture_title)
        else:
            # encrypted
            print(f"> Lecture '%s' has DRM, attempting to download" %
                  lecture_title)
            # NOTE(review): backslash separator assumes Windows paths.
            lecture_working_dir = "%s\%s" % (
                working_dir, lecture_asset["id"]
            )  # set the folder to download ephemeral files
            media_sources = lecture_asset["media_sources"]
            if not os.path.exists(lecture_working_dir):
                os.mkdir(lecture_working_dir)
            if not os.path.isfile(lecture_path):
                # DRM streams are fetched through their DASH manifest.
                mpd_url = next((x["src"] for x in media_sources
                                if x["type"] == "application/dash+xml"), None)
                if not mpd_url:
                    print(
                        "> Couldn't find dash url for lecture '%s', skipping...",
                        lecture_title)
                    return
                media_info = manifest_parser(mpd_url)
                handle_irregular_segments(media_info, lecture_title,
                                          lecture_working_dir, lecture_path)
                # Remove the ephemeral segment files once assembled.
                cleanup(lecture_working_dir)
            else:
                print("> Lecture '%s' is already downloaded, skipping..." %
                      lecture_title)

    # process assets
    if dl_assets:
        assets = []
        all_assets = lecture["supplementary_assets"]
        for asset in all_assets:
            if asset["asset_type"] == "File":
                assets.append(asset)
                asset_filename = asset["filename"]
                # Pick the URL whose label is exactly "download".
                download_url = next((x["file"]
                                     for x in asset["download_urls"]["File"]
                                     if x["label"] == "download"), None)
                if download_url:
                    try:
                        download(download_url,
                                 f"%s\\%s" % (lecture_dir, asset_filename),
                                 asset_filename)
                    except Exception as e:
                        print(
                            f"> Error downloading lecture asset: {e}. Skipping"
                        )
                        continue
            elif asset["asset_type"] == "Article":
                assets.append(asset)
                # Articles are saved verbatim as HTML next to the lecture.
                asset_path = f"%s\\%s.html" % (lecture_dir,
                                               sanitize(lecture_title))
                with open(asset_path, 'w') as f:
                    f.write(asset["body"])
            elif asset["asset_type"] == "ExternalLink":
                assets.append(asset)
                # External links are appended to one shared text file
                # per lecture index ('a' mode).
                asset_path = f"%s\\%s. External URLs.txt" % (lecture_dir,
                                                             lecture_index)
                with open(asset_path, 'a') as f:
                    f.write(f"%s : %s\n" %
                            (asset["title"], asset["external_url"]))
        print("> Found %s assets for lecture '%s'" %
              (len(assets), lecture_title))

    # process captions
    if dl_captions:
        captions = []
        # NOTE(review): assumes "captions" is present and iterable — a
        # missing key makes .get() return None and this loop raise; confirm.
        for caption in lecture_asset.get("captions"):
            if not isinstance(caption, dict):
                continue
            if caption.get("_class") != "caption":
                continue
            download_url = caption.get("url")
            if not download_url or not isinstance(download_url, str):
                continue
            # First non-empty language identifier wins; falls back to the
            # language part of locale_id (e.g. 'en' from 'en_US').
            lang = (caption.get("language") or caption.get("srclang")
                    or caption.get("label")
                    or caption.get("locale_id").split("_")[0])
            # Infer container format from the URL's final extension.
            ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
            if caption_locale == "all" or caption_locale == lang:
                captions.append({
                    "language": lang,
                    "locale_id": caption.get("locale_id"),
                    "ext": ext,
                    "url": download_url
                })

        for caption in captions:
            process_caption(caption, lecture_index, lecture_title, lecture_dir)
def _fetch_highest_resolution(url):
    """Download the best available stream for *url*; return (file path, sanitized title)."""
    video = YouTube(url)
    name = sanitize(video.title)
    downloaded = video.streams.get_highest_resolution().download()
    return downloaded, name


def _convert_to_audio(video_file, ext):
    """Extract the audio track of *video_file* into a .<ext> file.

    Removes the source video and returns the audio file path.
    """
    clip = VideoFileClip(video_file)
    audio_file = video_file.split(".mp4", 1)[0] + f".{ext}"
    audio = clip.audio
    audio.write_audiofile(audio_file, verbose=False, logger=None)
    audio.close()
    clip.close()
    os.remove(video_file)
    return audio_file


def _default_drive_root():
    """Root of the last detected drive, e.g. 'D:\\\\' (Windows-style)."""
    drives = get_drives()
    return drives[-1] + ":\\"


def process_download_video(url, path, type):
    """Download a YouTube video — or just its audio track — to *path*.

    Args:
        url: the YouTube URL; prompted for interactively while empty.
        path: destination folder; empty string means the root of the last
            available drive.
        type: target container; '' or anything containing 'mp4' keeps the
            video, otherwise audio is extracted to that format. (The name
            shadows the `type` builtin but is kept for compatibility.)

    Returns:
        A status string on success, or None when the destination folder
        is invalid.
    """
    while url == "":
        print("url cant be empty")
        url = input("*Paste the link here:")
    print("Downloading...")

    keep_video = "mp4" in type or type == ""

    if keep_video:
        if path == "":
            downloaded, name = _fetch_highest_resolution(url)
            destination = _default_drive_root() + name + ".mp4"
            shutil.move(downloaded, destination)
            return "Video downloaded: " + destination
        try:
            downloaded, name = _fetch_highest_resolution(url)
            destination = path + "\\" + name + ".mp4"
            shutil.move(downloaded, destination)
            return "Video downloaded: " + destination
        except Exception:  # BUGFIX: was a bare except (caught SystemExit etc.)
            print("invalid folder name")
            return

    if path == "":
        downloaded, name = _fetch_highest_resolution(url)
        audio_file = _convert_to_audio(downloaded, type)
        destination = _default_drive_root() + name + f".{type}"
        shutil.move(audio_file, destination)
        return "Audio downloaded: " + destination
    try:
        downloaded, name = _fetch_highest_resolution(url)
        audio_file = _convert_to_audio(downloaded, type)
        destination = path + "\\" + name + f".{type}"
        shutil.move(audio_file, destination)
        return "Audio downloaded: " + destination
    except Exception:  # BUGFIX: was a bare except (caught SystemExit etc.)
        print("invalid folder name")
        return