Exemple #1
0
 def test_br_tags_converted(self):
     """``<br>`` tags turn into two trailing spaces followed by a newline."""
     cases = [
         # bare text containing line breaks
         ("This text<br>has breaks<br>in it",
          "This text  \nhas breaks  \nin it"),
         # a line break nested inside a paragraph
         ("<p>A paragraph with a<br>linebreak</p>",
          "A paragraph with a  \nlinebreak"),
     ]
     for source, expected in cases:
         self.assertEqual(markdownify(source), expected)
Exemple #2
0
 def test_multiple_paragraphs(self):
     """Adjacent paragraphs are separated by a blank line; inner newlines collapse."""
     cases = [
         ("<p>This text is in a paragraph</p><p>And so is this text</p>",
          "This text is in a paragraph\n\nAnd so is this text"),
         # a newline inside a paragraph is expected to become a single space
         ("<p>This paragraph has\nnewlines in it</p><p>This doesn't</p>",
          "This paragraph has newlines in it\n\nThis doesn't"),
     ]
     for source, expected in cases:
         self.assertEqual(markdownify(source), expected)
def add_csv(hd, data, distnum, date):
    """Append one row for a District/Division/Area section to the CSV file.

    The row layout is ``[level, distnum, division, area, markdown, date,
    source_link]``. When ``hd`` starts with none of the known level names the
    row only contains ``[date, source_link]`` (unchanged legacy behaviour).

    Args:
        hd: Section heading; its prefix selects the level and its trailing
            characters supply the division/area identifiers.
        data: HTML fragment that is converted to Markdown for the row.
        distnum: District number written in the second column.
        date: Value written in the date column.

    Relies on the module-level ``csv_name`` and ``source_link`` values.
    """
    if hd.startswith('District'):
        prefix = ['District', distnum, '', '']
    elif hd.startswith('Division'):
        # The last character of the heading identifies the division.
        prefix = ['Division', distnum, hd[-1:], '']
    elif hd.startswith('Area'):
        # The last two characters are the division and the area respectively.
        prefix = ['Area', distnum, hd[-2:-1], hd[-1:]]
    else:
        prefix = None

    row = []
    if prefix is not None:
        row.extend(prefix)
        row.append(markdownify.markdownify(data).strip())
    row.append(date)
    row.append(source_link)

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by an empty line on Windows.
    with open(csv_name, 'a', newline='') as op_file:
        csv.writer(op_file).writerow(row)
    def to_markdown(self):
        """Return the element as Markdown: the title first, then each child of
        the first ``span`` found in the HTML content, concatenated."""
        title_md = markdownify(self.title_html[0].decode())
        span = self.html_content.find("span")
        body_md = [markdownify(child.decode()) for child in span.contents]
        return title_md + "".join(body_md)
Exemple #5
0
def to_table(row):
    """Render an iterable of mappings as three-column Markdown table rows.

    Only the first three values of each mapping are used. Every cell is
    converted with ``markdownify``; pipe characters are dropped and newlines
    become ``<br>`` so a cell cannot break the table row. The first cell is
    additionally passed through ``to_camel_case`` before conversion.
    """
    def as_cell(html):
        return markdownify(html).replace('|', '').replace("\n", '<br>')

    rendered = []
    for entry in row:
        values = list(entry.values())
        first = as_cell(to_camel_case(values[0]))
        second = as_cell(values[1])
        third = as_cell(values[2])
        rendered.append(f"|{first}|{second}|{third}\n")
    return "".join(rendered)
Exemple #6
0
 def test_bolding(self):
     """``<strong>`` elements are rendered with double asterisks."""
     cases = [
         ("This text contains <strong>bolding</strong>.",
          "This text contains **bolding**."),
         ("<strong>bolding</strong> and <strong>more bolding</strong>",
          "**bolding** and **more bolding**"),
         # bold text inside a paragraph that is followed by another paragraph
         ("<p>Some <strong>bolded text</strong></p><p>In a paragraph</p>",
          "Some **bolded text**\n\nIn a paragraph"),
     ]
     for source, expected in cases:
         self.assertEqual(markdownify(source), expected)
Exemple #7
0
 def test_hyperlinks(self):
     """Anchors are rendered as ``[text](href)`` links."""
     single_html = '<a href="https://www.pythonmorsels.com">Python Morsels</a>'
     single_md = "[Python Morsels](https://www.pythonmorsels.com)"

     # two links embedded in running text
     double_html = (
         'link 1 <a href="http://trey.io">here</a> and '
         'link 2 <a href="http://pypi.io">there</a>!'
     )
     double_md = 'link 1 [here](http://trey.io) and link 2 [there](http://pypi.io)!'

     for source, expected in ((single_html, single_md), (double_html, double_md)):
         self.assertEqual(markdownify(source), expected)
Exemple #8
0
    def __sanitizeEvent(self, event: Dict, resource: Dict[str, str],
                        hasRca: bool) -> Dict:
        """Flatten an RH event into a single dict, converting HTML to markdown.

        Args:
            event (Dict): RH event data.
            resource (Dict[str, str]): Azure resource config data.
            hasRca (bool): True if the event contains Rca, else False.

        Returns:
            Dict: Resource identifiers, event identifiers and the parsed
            properties merged into one mapping (parsed properties win on
            key clashes).
        """
        properties = event[PROPERTIES]
        recommendedActions = None
        summary = None

        if RECOMMENDED_ACTIONS_CONTENT in properties:
            self.tracer.info("[%s] event with id=%s has RCA." %
                             (self.fullName, event[ID]))
            # Both the recommended-actions content and the summary arrive as
            # HTML in this case, so both are converted.
            recommendedActions = markdownify(
                properties[RECOMMENDED_ACTIONS_CONTENT])
            summary = markdownify(properties[SUMMARY])
        elif RECOMMENDED_ACTIONS in properties:
            self.tracer.info("[%s] event with id=%s does not have RCA." %
                             (self.fullName, event[ID]))
            # The actions are a plain-text list: render them as HTML first,
            # then convert that HTML to markdown. The summary is used as-is.
            actionsAsHtml = self.__formatToHtml(properties[RECOMMENDED_ACTIONS])
            recommendedActions = markdownify(actionsAsHtml)
            summary = properties[SUMMARY]

        sanitizedEvent = {
            # Resource related data
            AZ_RESOURCE_ID: resource[AZ_RESOURCE_ID],
            SID: resource[SID],
            ARM_TYPE: resource[ARM_TYPE],
            # RH related data
            ID: event[ID],
            NAME: event[NAME],
            RH_TYPE: event[TYPE],
        }
        parsedProperties = self.__populateProperties(event, summary,
                                                     recommendedActions,
                                                     hasRca)
        return {**sanitizedEvent, **parsedProperties}
 def markdown():
     """Convert the HTML file named in ``fileText`` to a Markdown file.

     The file is looked up under ``<cwd>\\AlHtmlToMarkdown`` and the result is
     written to ``<cwd>\\AlHtmlToMarkdown\\Markdown`` with the extension
     replaced by ``.md``. The outcome (success, wrong extension, conversion
     failure or missing file) is reported in the ``text`` widget.
     """
     filename = fileText.get()
     filepath = os.path.join(cwd + '\\AlHtmlToMarkdown', filename)

     if not os.path.exists(filepath):
         text.delete(1.0, END)
         text.insert(1.0, 'Invalid file path')
         return

     # splitext only touches the real suffix, unlike str.replace which would
     # also rewrite the extension text if it occurs earlier in the name.
     stem, extension = os.path.splitext(filename)
     if extension.lower() != ".html":
         # Previously this case did nothing at all and left stale text behind.
         text.delete(1.0, END)
         text.insert(1.0, 'Invalid document, please provide .html '
                     'extension files')
         return

     try:
         with open(filepath, "r") as htmlFile:
             html = htmlFile.read()
         markDown = markdownify.markdownify(html, heading_style="ATX")
         markdownFileName = stem + '.md'
         markdownFilePath = os.path.join(cwd + '\\AlHtmlToMarkdown\\Markdown',
                                         markdownFileName)
         with open(markdownFilePath, "w") as markdownFile:
             markdownFile.write(markDown)
     except Exception as e:
         # GUI boundary: report any read/convert/write failure to the user.
         text.delete(1.0, END)
         print(str(e))
         text.insert(1.0, 'Invalid document, please provide .html '
                     'extension files')
     else:
         text.delete(1.0, END)
         text.insert(1.0, markdownFileName + ' has been saved '
                     'successfully in Markdown folder')
Exemple #10
0
def main():
    """Export the posts of a phpBB sqlite3 dump into per-topic Markdown files.

    Reads the ``dbfile`` command-line argument and exits with status 1 when
    that file does not exist. Every post is appended to
    ``<forum folder>/<topic_id>_<slugified topic name>.md``; the topic title
    is written as a level-1 heading when the file is still empty, and each
    post gets a level-2 heading with its timestamp and author followed by the
    post text converted to Markdown and, if flagged, its attachment links.
    """
    parser = argparse.ArgumentParser(description='Fun with phpbb database')
    parser.add_argument('dbfile',
                        type=str,
                        help='phpBB database dump (sqlite3)')

    args = parser.parse_args()

    if not os.path.exists(args.dbfile):
        print('Failed to open file \'{}\''.format(args.dbfile))
        sys.exit(1)

    connection = sqlite3.connect(args.dbfile)
    try:
        forums = get_forums(connection)
        topics = get_topics(connection)
        attachments = get_attachments(connection)
        posts = get_posts(connection)

        forum_paths = create_forums_folders(forums)

        # Group attachment file names by post once instead of rescanning the
        # whole attachment table for every post that has attachments.
        filenames_by_post = {}
        for attachment in attachments.values():
            filenames_by_post.setdefault(attachment['post_id'], []).append(
                attachment['real_filename'])

        # The connection stays open for the whole loop in case ``posts`` is
        # produced lazily from it.
        for post in posts:
            post_id = post['post_id']
            topic_id = post['topic_id']
            post_time = datetime.datetime.fromtimestamp(
                post['post_time']).strftime('%Y-%m-%d %H:%M:%S')

            topic = topics[topic_id]
            topic_name = topic['name']
            base_path = forum_paths[topic['forum_id']]

            post_filepath = os.path.join(
                base_path, '{id}_{name}.md'.format(id=topic_id,
                                                   name=slugify(topic_name)))

            # Explicit UTF-8 so forum text with non-ASCII characters can be
            # written regardless of the platform's default encoding.
            with io.open(post_filepath, 'a', encoding='utf-8') as f:
                if f.tell() == 0:
                    # Write header
                    f.write('# {}\n\n'.format(topic_name))
                else:
                    f.write('\n\n')

                f.write('## {}, posted by: {}\n\n'.format(
                    post_time, post['post_username']))
                f.write(markdownify(post['post_text']))

                if post['got_attachment']:
                    f.write('\n\n### Attachments\n\n')
                    for name in filenames_by_post.get(post_id, []):
                        f.write(
                            '[{attachment}]({attachment})'.format(
                                attachment=name))
    finally:
        connection.close()
 def _clean(self, value):
     """Sanitise ``value`` with ``HTML_CLEANER`` and convert it to Markdown.

     Any falsy value (``None``, empty string, ...) yields an empty string.
     """
     if not value:
         return ""
     return markdownify(HTML_CLEANER.clean_html(value))
Exemple #12
0
 def render_body(self):
     """
     Render the plain-text body of the email.

     Preference order: the inline ``body_template``, then the template named
     by ``template_name``, then the rendered HTML body converted to Markdown
     (only when ``markdownify`` can be imported). Raises ``MissingBody`` when
     none of these produces a body.
     """
     if self.body_template is not None:
         return Template(self.body_template).render(self.get_context())
     if self.template_name is not None:
         return loader.get_template(self.template_name).render(self.get_context())

     try:
         from markdownify import markdownify
     except ImportError:
         # Without markdownify there is no way to derive a text body.
         html_body = None
     else:
         html_body = self.render_html_body()

     if html_body is None:
         raise MissingBody('The email does not have a body. Either'
                           ' provide a body or template_name or, if you'
                           ' really want to send an email without a'
                           ' body, set the body to an empty string'
                           ' explicitly.')
     return markdownify(html_body, convert=ALLOWED_TAGS)
Exemple #13
0
    def __init__(self, Tree):
        """Index the ``text`` elements of ``Tree`` by their start time.

        Each entry stores the start, duration, end (start + duration rounded
        to two decimals), the raw text and its stripped Markdown rendering.
        ``duration`` is the largest end value; ``time`` and ``current`` start
        out at ``0.0`` and ``None``.
        """
        self.Tree = Tree

        entries = {}
        for node in self.Tree.findall("text"):
            start = float(node.attrib["start"])
            length = float(node.attrib["dur"])
            entries[start] = {
                "start": start,
                "duration": length,
                "end": round(start + length, 2),
                "text": node.text,
                "markdown": markdownify(node.text).strip(),
            }
        self.TextElements = entries

        end_times = [entry["end"] for entry in self.TextElements.values()]
        end_times.sort()
        self.duration = end_times[-1]

        self.time = 0.0
        self.current = None
Exemple #14
0
def cleanup_html(cell: str) -> Optional[str]:
    """Convert an HTML cell to tidy Markdown, or return None for a None cell.

    Placeholder paragraphs are removed and non-breaking spaces become regular
    spaces before conversion. After conversion every line is stripped, and
    runs of empty lines are reduced: a blank line is only emitted when a run
    reaches its third consecutive empty line, so shorter runs disappear.
    Any literal ``<p>`` left in the result is removed.
    """
    if cell is None:
        return None

    # The original replace(" ", " ") was a no-op as written; the non-breaking
    # space it most likely targeted is spelled out here as \xa0.
    # NOTE(review): confirm against the upstream data that \xa0 is the intent.
    cleaned: str = (
        cell.replace("<p>\xa0</p>", "")
        .replace("<p> </p>", "")
        .replace("\xa0", " ")
    )
    markdown: str = markdownify(cleaned).strip()

    # Example Formatting Locator Strings:
    # The Things They Carry Video Series
    # The FBI is offering a reward for information leading to the arrest of Juan Carlos Martinez

    # Seen HTML Tags: ['br', 'div', 'p', 'a', 'li', 'ol', 'ul', 'strong']

    cleaned_lines: List[str] = []
    blank_run: int = 0
    for line in markdown.splitlines():
        stripped = line.strip()
        if stripped:
            blank_run = 0
            cleaned_lines.append(stripped)
        else:
            # Only the third empty line of a run produces a blank line.
            if blank_run == 2:
                cleaned_lines.append("")
            blank_run += 1

    # Not entirely sure how one random instance of <p> still escapes cleaning.
    return "\n".join(cleaned_lines).replace("<p>", "")
Exemple #15
0
 def render_body(self):
     """
     Render the plain-text body of the email.

     Preference order: the inline ``body_template``, then the template named
     by ``template_name``. Otherwise the ``body`` attribute (if present) is
     the starting value, and it is replaced by the rendered HTML body
     converted to Markdown whenever ``markdownify`` can be imported and an
     HTML body exists. Raises ``MissingBody`` when the result is ``None``.
     """
     if self.body_template is not None:
         return Template(self.body_template).render(self.get_context())
     if self.template_name is not None:
         return loader.get_template(self.template_name).render(
             self.get_context_data())

     body = getattr(self, 'body', None)
     try:
         from markdownify import markdownify
     except ImportError:
         html_body = None
     else:
         html_body = self.render_html_body()

     if html_body is not None:
         # The converted HTML takes precedence over an explicit body.
         body = markdownify(html_body, convert=ALLOWED_TAGS)
     if body is None:
         raise MissingBody('The email does not have a body. Either'
                           ' provide a body or template_name or, if you'
                           ' really want to send an email without a'
                           ' body, set the body to an empty string'
                           ' explicitly.')
     return body
def soup_to_md(soup):
    """Convert the top-level children of ``soup`` into one Markdown string.

    Elements whose tag is in ``COPY_TAGS`` are kept as pretty-printed HTML;
    everything else is converted with ``markdownify`` (ATX headings, ``*``
    bullets) and has its blank lines removed. The pieces are joined with one
    blank line, trailing whitespace is stripped, tabs become four spaces,
    code points in ``DELETE_UNICODE`` are dropped and every remaining
    character outside printable ASCII/newline becomes a numeric entity.
    """
    def rstrip_lines(text, keep_blank=True):
        return '\n'.join(
            line.rstrip() for line in text.splitlines()
            if keep_blank or line.strip()
        )

    # Top-level elements are converted one by one to avoid leading space.
    pieces = []
    for node in soup.contents:
        if node.name in COPY_TAGS:
            pieces.append(to_pretty_html(node))
            continue
        converted = markdownify(str(node), heading_style='ATX', bullets='*').strip()
        pieces.append(rstrip_lines(converted, keep_blank=False).strip())

    result = rstrip_lines('\n\n'.join(pieces))
    # collapse runs of blank lines into a single blank line
    result = re.sub('\n\n\n+', '\n\n', result)
    result = result.replace('\t', '    ')
    for codepoint in DELETE_UNICODE:
        result = result.replace(chr(codepoint), '')
    # The entities are pure ASCII, so one substitution pass is equivalent to
    # replacing each distinct character in turn.
    result = re.sub('[^ -~\n]', lambda found: f'&#{ord(found.group())};', result)
    return result.strip()
Exemple #17
0
    def __init__(self, Tree):
        """Index the ``text`` elements of ``Tree`` by their start time.

        Each entry stores the start, duration, end (start + duration rounded
        to two decimals), the raw text and its Markdown rendering.
        ``duration`` is the largest end value; ``time`` and ``current`` start
        out at ``0.0`` and ``None``.
        """
        self.Tree = Tree

        def describe(node):
            begin = float(node.attrib['start'])
            span = float(node.attrib['dur'])
            return begin, {
                'start': begin,
                'duration': span,
                'end': round(begin + span, 2),
                'text': node.text,
                'markdown': markdownify(node.text),
            }

        self.TextElements = dict(
            describe(node) for node in self.Tree.findall('text'))

        ends = sorted(item['end'] for item in self.TextElements.values())
        self.duration = ends[-1]

        self.time = 0.0
        self.current = None
Exemple #18
0
def load_assignment_details(assignment_ids, api_url, api_key, current_course):
    """Return the 15 detail values for the first selected assignment.

    Raises ``PreventUpdate`` when ``assignment_ids`` is ``None`` and returns
    a tuple of empty placeholders when it is an empty list. Otherwise the
    course is fetched and the details of the first matching assignment are
    returned; the assignment-group options are emptied when more than one
    assignment matches.
    """
    if assignment_ids is None:
        raise PreventUpdate
    if assignment_ids == []:
        # one empty options list, ten empty strings, four empty lists
        return ([],) + ('',) * 10 + ([], [], [], [])

    course = get_course(api_url, api_key, current_course)
    all_assignments = course.assignments
    groups = course.assignment_groups

    # TODO: support multiple assignments
    selected = [item for item in all_assignments if item.id in assignment_ids]
    assignment = selected[0]
    details = assignment.description
    if len(selected) > 1:
        groups = []

    group_options = [{'label': group.name, 'value': group.id} for group in groups]
    return (
        group_options,
        assignment.assignment_group_id,
        assignment.id,
        assignment.created_at,
        assignment.updated_at,
        assignment.position,
        sum([getattr(assignment, 'needs_grading_count', 0)]),
        assignment.name,
        assignment.due_at,
        assignment.points_possible,
        markdownify(details or '_No description set in Canvas_'),
        details,
        [{'label': 'None', 'value': 'None'}],
        assignment.grading_type,
        list(assignment.submission_types),
    )
    def map(cls, breach: JSON) -> JSON:
        """Build an indicator from a breach record, starting from ``cls.DEFAULTS``.

        Confidence is 'High' for verified breaches and 'Medium' otherwise;
        severity is 'High' only for verified breaches whose data classes
        include 'Passwords'.
        """
        levels = ('Medium', 'High')

        indicator: JSON = cls.DEFAULTS.copy()

        # The id is derived while the indicator still holds only the defaults.
        indicator['id'] = transient_id(indicator, breach["Name"])

        # `BreachDate` is a bare date (YYYY-MM-DD); a midnight UTC time is
        # appended so that the value complies with ISO 8601.
        start_time = breach['BreachDate'] + 'T00:00:00Z'
        indicator['valid_time'] = {'start_time': start_time}

        indicator['confidence'] = levels[breach['IsVerified']]

        # `Description` is HTML; it is converted to Markdown to comply with CTIM.
        indicator['description'] = markdownify(breach['Description'])

        is_severe = (breach['IsVerified']
                     and 'Passwords' in breach['DataClasses'])
        indicator['severity'] = levels[is_severe]

        indicator['short_description'] = breach['Title']
        indicator['tags'] = breach['DataClasses']
        indicator['title'] = breach['Name']

        return indicator
Exemple #20
0
    async def get_page_contents(self) -> Union[str, Embed]:
        """Fetch the intro extract of the current page from Russian Wikipedia.

        Returns:
            An info embed containing the extract converted to Markdown (with
            images stripped), or an error embed when the request times out
            or the connection fails.
        """
        title = self.pages[self.page_index]

        # NOTE(review): "explainttext" looks like a misspelling of the API's
        # "explaintext" flag. It is left untouched because the result is fed
        # through markdownify below, which suggests HTML is expected - confirm.
        params = {"action": "query",
                  "prop": "extracts",
                  "titles": title,
                  "format": "json",
                  "exintro": "true",
                  "explainttext": "true"}

        # The context managers guarantee that both the session and the
        # response are released, including on the error path.
        async with ClientSession(timeout=ClientTimeout(20)) as session:
            try:
                async with session.get("https://ru.wikipedia.org/w/api.php",
                                       params=params) as response:
                    data = await response.json()
            except (asyncio.TimeoutError, ClientConnectionError):
                return styled_embed_generator.get_embed(Style.ERROR, self.tr.translate("search_connection_error"))

        pages = data["query"]["pages"]
        # The response is keyed by page id; the first entry is used.
        text = pages[next(iter(pages))]["extract"]
        text = markdownify(text, strip=["img"])

        embed = styled_embed_generator.get_embed(Style.INFO, text, title=title, author=self.original_author,
                                                 guild=self.guild)

        return embed
Exemple #21
0
    def __init__(self, Tree: ElementTree):
        """Index the ``text`` elements of ``Tree`` by their start time.

        Each entry stores the start, duration, end (start + duration rounded
        to two decimals), the raw text and its Markdown rendering.
        ``duration`` is the largest end value.
        """
        super().__init__()

        self.Tree = Tree

        collected = {}
        for element in self.Tree.findall("text"):
            starts_at = float(element.attrib["start"])
            lasts = float(element.attrib["dur"])
            collected[starts_at] = dict(
                start=starts_at,
                duration=lasts,
                end=round(starts_at + lasts, 2),
                text=element.text,
                markdown=markdownify(element.text),
            )
        self.TextElements = collected

        ordered_ends = sorted(
            record["end"] for record in self.TextElements.values())
        self.duration = ordered_ends[-1]
Exemple #22
0
    def __init__(self, Data: str):
        """Parse ``Data`` as XML and index its ``text`` elements by start time.

        Elements without a ``dur`` attribute are skipped. Each entry stores
        the start, duration, end (start + duration rounded to two decimals),
        the raw text and its Markdown rendering. ``duration`` is the largest
        end value.
        """
        super().__init__()

        self.Tree = ElementTree.fromstring(Data)

        indexed = {}
        for node in self.Tree.findall("text"):
            if "dur" not in node.attrib:
                continue
            began = float(node.attrib["start"])
            lasted = float(node.attrib["dur"])
            indexed[began] = {
                "start": began,
                "duration": lasted,
                "end": round(began + lasted, 2),
                "text": node.text,
                "markdown": markdownify(node.text),
            }
        self.TextElements = indexed

        finish_times = sorted(
            entry["end"] for entry in self.TextElements.values())
        self.duration = finish_times[-1]
Exemple #23
0
async def get_diary_embed(dids):
    """Build a Discord embed summarising the given diary (log) entries.

    For every id the log entry is fetched and rendered as a linked film
    title, followed by the diary date, star rating, like and rewatch
    markers and the review (or a spoiler notice) when present. The
    thumbnail is taken from the poster of the last fetched film.

    Args:
        dids: Iterable of log-entry ids.

    Returns:
        A ``discord.Embed`` whose description holds one section per entry.
    """
    parts = []
    film = None  # stays None when ``dids`` is empty
    for did in dids:
        d_entry = await api.api_call(path=f'log-entry/{did}')
        film = d_entry['film']
        # 'diaryDetails' is optional, so fall back to an empty mapping rather
        # than indexing it unconditionally for the rewatch flag.
        diary_details = d_entry.get('diaryDetails') or {}

        parts.append(f"**[{film['name']} ({film['releaseYear']})]")
        parts.append(f'({get_link(d_entry)})**\n')
        if diary_details:
            parts.append(f"**{diary_details['diaryDate']}** ")
        if 'rating' in d_entry:
            rating = d_entry['rating']
            parts.append(' ' + int(rating) * '★')
            # Test the fractional part instead of the last character of the
            # string form, which wrongly matched a whole rating of 5.
            if float(rating) % 1:
                parts.append('½ ')
        if d_entry['like']:
            parts.append(' <3')
        if diary_details.get('rewatch'):
            parts.append(' ↺')
        if 'review' in d_entry:
            if d_entry['review']['containsSpoilers']:
                parts.append('\n```Contains spoilers```')
            else:
                # Previously the continuation line was a separate statement
                # (unary plus on a str), which raised TypeError.
                parts.append(
                    '\n```'
                    + markdownify(d_entry['review']['text'][:1600])
                    + '```')
        parts.append('\n')

    embed = discord.Embed(description=''.join(parts))
    if film is not None and 'poster' in film:
        embed.set_thumbnail(url=film['poster']['sizes'][-1]['url'])

    return embed
Exemple #24
0
def hyper_markdownify(s):
    """Peel off every layer of JSON encoding from ``s``, then markdownify it.

    Decoding is repeated until ``json.loads`` rejects the value (invalid JSON
    or a non-string result); whatever was decoded last is converted.
    """
    decoded = s
    try:
        while True:
            decoded = json.loads(decoded)
    except (json.decoder.JSONDecodeError, TypeError):
        pass
    return markdownify(decoded)
Exemple #25
0
    def dump(self, post: Post) -> None:
        """Write ``post`` to ``post.filepath`` as Markdown with front matter.

        The post is serialised without ``canonical_url`` and ``filepath``;
        its ``content`` field becomes the Markdown body and the remaining
        fields become the front-matter metadata.
        """
        fields = json.loads(post.json(exclude={"canonical_url", "filepath"}))
        body = markdownify(fields.pop("content"))
        document = frontmatter.Post(body, **fields)
        frontmatter.dump(document, post.filepath, encoding="utf-8")
Exemple #26
0
def get_summary(entry: feedparser.FeedParserDict) -> str:
    """Return the entry's summary, or '' when an attribute lookup fails.

    HTML/XML summaries are converted to Markdown and then passed through
    ``remove_html_tags``; other summary types are returned unchanged.
    """
    try:
        raw = entry.summary
        is_markup = entry.summary_detail.type in ('text/html', 'text/xml')
        return remove_html_tags(markdownify(raw)) if is_markup else raw
    except AttributeError:
        return ''
Exemple #27
0
def write_to_file(article, header):
    """Write ``header``, a ``------`` separator line and the article body.

    The target path is derived from the article title via ``_build_path``
    and the body is the article HTML converted to Markdown with ATX
    headings. An existing file at that path is overwritten.
    """
    file_path = _build_path(article.title)
    markdown = markdownify.markdownify(article.html, heading_style='ATX')
    # One open is enough (the file used to be reopened in append mode), and
    # explicit UTF-8 keeps non-ASCII article text writable on every platform.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(header)
        f.write('------\n')
        f.write(markdown)
Exemple #28
0
def extract_data(html):
    """Extract the title, company and Markdown description of a vacancy page.

    The company is resolved through ``company_data`` using the link target
    and the link text (non-breaking spaces replaced by regular spaces); when
    no company link is found the ``company`` value is whatever the lookup
    returned.
    """
    page = BeautifulSoup(html, 'lxml')
    heading = page.find('h1', attrs={'data-qa': "vacancy-title"}).text
    description = page.find('div', attrs={'data-qa': "vacancy-description"})
    company = page.find('a', class_="vacancy-company-name")
    if company:
        display_name = company.text.replace(u'\xa0', u' ')
        company = company_data(company['href'], display_name)

    result = {'title': heading, 'company': company}
    result['body'] = markdownify(str(description))
    return result
def main():
    """Convert the HTML file at ``SOURCE_PATH`` to Markdown at ``TARGET_PATH``."""
    # Context managers close the handles even if reading, converting or
    # writing raises; the manual open/close pairs leaked them in that case.
    with open(SOURCE_PATH) as source_file:
        source = source_file.read()

    target = markdownify.markdownify(source)
    with open(TARGET_PATH, "w") as target_file:
        target_file.write(target)
Exemple #30
0
	def request_details(cls, request):
		"""Build the asset-details payload for the ticker in ``request``.

		Fetches the coin data and one year of OHLC history from CoinGecko.
		The English description is converted to Markdown and shortened to its
		leading paragraphs, followed by a "Read more" link.

		Returns:
			``[payload, ""]`` on success, or ``[{}, ""]`` when either CoinGecko
			request (or reading the ticker symbol) fails.
		"""
		ticker = request.get("ticker")

		try:
			assetData = CoinGecko.connection.get_coin_by_id(id=ticker.get("symbol"), localization="false", tickers=False, market_data=True, community_data=True, developer_data=True)
			historicData = CoinGecko.connection.get_coin_ohlc_by_id(id=ticker.get("symbol"), vs_currency="usd", days=365)
		except Exception:
			# Best effort: any lookup failure yields an empty payload. Unlike a
			# bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
			return [{}, ""]

		description = markdownify(assetData["description"].get("en", "No description"))
		descriptionParagraphs = description.split("\r\n\r\n")
		# Running total of characters for the paragraphs kept so far. The first
		# paragraph is always kept; more are added until the total would pass
		# 500 with over 300 already kept, or would pass 1900.
		textLength = [len(descriptionParagraphs[0])]
		for paragraph in descriptionParagraphs[1:]:
			nextLength = textLength[-1] + len(paragraph)
			if (nextLength > 500 and textLength[-1] > 300) or nextLength > 1900: break
			textLength.append(nextLength)
		description = "\n".join(descriptionParagraphs[:len(textLength)]) + "\n[Read more on CoinGecko](https://www.coingecko.com/coins/{})".format(ticker.get("symbol"))

		# Positions 2 and 3 of every OHLC row are used as the high and the low.
		highs = [e[2] for e in historicData]
		lows = [e[3] for e in historicData]

		marketData = assetData["market_data"]
		payload = {
			"name": "{} ({})".format(assetData["name"], ticker.get("base")),
			"description": description,
			"rank": marketData["market_cap_rank"],
			"supply": {},
			"score": {
				"developer": assetData["developer_score"],
				"community": assetData["community_score"],
				"liquidity": assetData["liquidity_score"],
				"public interest": assetData["public_interest_score"]
			},
			"price": {
				"current": marketData["current_price"].get("usd"),
				"ath": marketData["ath"].get("usd"),
				"atl": marketData["atl"].get("usd")
			},
			"change": {
				"past day": marketData["price_change_percentage_24h_in_currency"].get("usd"),
				"past month": marketData["price_change_percentage_30d_in_currency"].get("usd"),
				"past year": marketData["price_change_percentage_1y_in_currency"].get("usd")
			},
			"sourceText": "Data from CoinGecko",
			"platform": "CoinGecko",
		}

		if assetData["image"]["large"].startswith("http"): payload["image"] = assetData["image"]["large"]
		homepage = assetData["links"]["homepage"][0]
		if homepage != "":
			# Strip spaces and make sure the link carries a scheme.
			url = homepage.replace(" ", "")
			payload["url"] = url if url.startswith("http") else "https://" + url
		if marketData["total_volume"] is not None: payload["volume"] = marketData["total_volume"].get("usd")
		if marketData["market_cap"] is not None: payload["marketcap"] = marketData["market_cap"].get("usd")
		if marketData["total_supply"] is not None: payload["supply"]["total"] = marketData["total_supply"]
		if marketData["circulating_supply"] is not None: payload["supply"]["circulating"] = marketData["circulating_supply"]
		if len(highs) != 0: payload["price"]["1y high"] = max(highs)
		if len(lows) != 0: payload["price"]["1y low"] = min(lows)

		return [payload, ""]
Exemple #31
0
def main(argv):
    """Copy HTML posts to an output directory and convert them to Markdown.

    Args:
        argv: ``[input_dir, output_dir, ...]``; extra items are ignored.

    Every HTML file found in the output directory has its ``---`` delimited
    front matter extended with a ``title`` (file name without the date prefix
    and extension) and an ``author_name`` (text of the
    ``profile-usercard-hover`` element with any parenthesised part removed).
    The remaining HTML is converted to Markdown and the file is renamed from
    ``.html`` to ``.md``. If any file fails, the error is printed and the
    whole output directory is emptied. The function always ends by calling
    ``exit()``.
    """
    input_dir = argv[0]
    output_dir = argv[1]

    print("Input directory: " + input_dir)
    print("Output directory: " + output_dir)

    copy_files(input_dir, output_dir)
    print("Copied files to output directory.")

    files = get_html_files(output_dir)

    # Non-greedy so that only the leading front-matter block is matched, not
    # everything up to the last '---' in the document.
    front_matter_re = re.compile(r'---[\s\S]*?---')

    try:
        for f in files:
            print("Editing file: " + os.path.basename(f))
            with open(f, 'r', encoding='utf-8') as original:
                filedata = original.read()

            found = front_matter_re.search(filedata)
            if found is None:
                raise ValueError("No front matter found in " + f)
            font_matter: str = found[0]
            filedata = front_matter_re.sub("", filedata, count=1)

            soup = BeautifulSoup(filedata, "html.parser")
            author_name = getattr(soup.find(class_="profile-usercard-hover"), 'text', '')
            author_name = re.sub(pattern=r"\([a-zA-Z\s]*\)", repl="", string=author_name)
            author_name = "author_name: " + author_name

            title = os.path.basename(f).replace(".html", "").replace(".md", "")
            title = re.sub(pattern=r"\d*-\d*-\d*-", repl="", string=title)
            title = "title: " + "\"" + title + "\""

            # Insert the new keys right after the opening '---' delimiter.
            font_matter = (font_matter[0:3] + "\n"
                           + title + "\n"
                           + author_name
                           + font_matter[3:]
                           + "\n")

            with open(f, 'w', encoding='utf-8') as modified:
                # markdownify's option is ``heading_style``; the former
                # ``header='ATX'`` keyword is not a documented option.
                converted_article = font_matter + markdownify(
                    filedata, bullets='-', heading_style='ATX')
                modified.write(converted_article)
            os.rename(f, f.replace('.html', '.md'))

    except Exception as e:
        print(e)
        print('Batch process failed. Deleting the contents of the output directory.')
        for filename in os.listdir(output_dir):
            filepath = os.path.join(output_dir, filename)
            try:
                shutil.rmtree(filepath)
            except OSError:
                # Not a directory: remove it as a plain file.
                os.remove(filepath)

    print('Done.')
    exit()
Exemple #32
0
    def get_edit(self, id):
        """ Get route for editing an Item.

            Only the user who owns the item gets the edit form (pre-filled
            with the item text converted to Markdown); everyone else is
            redirected to the item page.
            TODO: Make decorator for "ownership" of item
        """
        item = Item.query.get_or_404(id)
        if current_user.id != item.user_id:
            return redirect(url_for('.item', id=id))

        commentForm = self._commentForm(request)
        commentForm.text.data = markdownify(item.text)
        commentForm.edit.data = True

        return render_template('item/item.html',
                               item=item,
                               form=commentForm,
                               title=item.title,
                               edit=True)
def process(url, start=0, fetch=50):
    """Import Tumblr posts page by page and write each as a Markdown file.

    Every page is requested from ``url``, parsed as XML, and each post is
    rendered through the module-level template matching its type (quote,
    photo, link, video or regular). Photos are downloaded into
    ``photoFolder``; the resulting post files are written into
    ``postFolder``. Posts of an unknown type, regular posts without a
    title, and photos that cannot be downloaded or have an unrecognised
    content type are skipped with a console message.

    Args:
        url: Format string with two ``%s`` placeholders that receive the
            start offset and the page size of each request.
        start: Offset of the first post to request.
        fetch: Number of posts requested per page.

    Raises:
        Exception: If a page request does not answer with HTTP status 200.
    """

    pos = start

    # End will be updated during each request with incoming data
    end = pos + fetch

    Console.header("Tumblr Import")
    Console.info("Importing data...")
    Console.indent()

    while pos < end:
        Console.info("Requesting %s-%s of %s" % (pos, pos+fetch-1, end))

        response = requests.get(url % (pos, fetch))

        if response.status_code != 200:
            # Report the status code of the response that actually failed.
            raise Exception("Error during communication with Tumblr: %s" % response.status_code)

        tree = ElementTree.fromstring(response.content)

        # This element contains all posts
        allPosts = tree.find("posts")

        # Update end pointer
        end = int(allPosts.get("total"))

        # Iterate through all posts
        for post in allPosts:
            postType = post.get("type")
            postTimeStamp = post.get("unix-timestamp")
            postExportDate = str(datetime.datetime.fromtimestamp(int(postTimeStamp)))

            postSlug = post.get("slug")
            # Date part only: everything before the first space of the timestamp string
            postDateOnly = postExportDate[0:postExportDate.find(" ")]

            if postType == "quote":
                quoteText = post.find("quote-text").text
                quoteComment = post.find("quote-source").text

                # Post-process
                quoteText = markdownify.markdownify("<blockquote>" + quoteText + "</blockquote>").strip("\n")
                quoteComment = markdownify.markdownify(quoteComment).rstrip("\n")

                fileContent = quoteTemplate % (postSlug, postExportDate, quoteText + "\n\n" + quoteComment)

            elif postType == "photo":
                photoText = post.find("photo-caption").text
                try:
                    photoLinkUrl = post.find("photo-link-url").text
                except AttributeError:
                    # find() returned None: the photo has no external link
                    photoLinkUrl = None
                photoUrl = post.find("photo-url").text

                # Post-process
                photoText = markdownify.markdownify(photoText).rstrip("\n")

                # Downloading image
                photoResponse = requests.get(photoUrl, allow_redirects=True)
                if photoResponse.status_code != 200:
                    Console.error("Unable to load photo. Status: %s; URL: %s", photoResponse.status_code, photoUrl)
                    continue

                # Build extension based on response headers (safer than using file extension)
                photoType = photoResponse.headers["content-type"]

                if "png" in photoType:
                    photoExtension = ".png"
                elif "jpeg" in photoType or "jpg" in photoType:
                    photoExtension = ".jpeg"
                elif "gif" in photoType:
                    photoExtension = ".gif"
                else:
                    Console.error("Unknown photo format: %s; Status: %s; URL: %s", photoType, photoResponse.status_code, photoUrl)
                    continue

                # Generating checksum
                photoHash = hashlib.sha1(photoResponse.content).hexdigest()

                # Generate file name and path from existing data
                photoFileName = "%s-%s-%s%s" % (postDateOnly, postSlug, photoHash[0:10], photoExtension)
                photoPath = os.path.join(photoFolder, photoFileName)

                # Do not repeatedly write identical files
                if not os.path.exists(photoPath):
                    # Context manager closes the handle even if the write fails
                    with open(photoPath, "wb") as photoFile:
                        photoFile.write(photoResponse.content)

                # Generate basic image tag
                photoAsset = '<img src="{{@asset.url %s/%s/%s}}" alt=""/>' % (projectName, photoAssetFolder, photoFileName)

                # Wrap with a link when it should be link to an external site
                if photoLinkUrl:
                    photoAsset = '<a href="%s">%s</a>' % (photoLinkUrl, photoAsset)

                fileContent = photoTemplate % (postSlug, postExportDate, photoAsset + "\n\n" + photoText)

            elif postType == "link":
                linkUrl = post.find("link-url").text
                try:
                    linkText = post.find("link-text").text
                except AttributeError:
                    # find() returned None: fall back to showing the URL itself
                    linkText = linkUrl

                # Post-process
                if linkText != linkUrl:
                    linkText = markdownify.markdownify(linkText).rstrip("\n")

                fileContent = linkTemplate % (postSlug, postExportDate, "[%s](%s)" % (linkText, linkUrl))

            elif postType == "video":
                videoCode = post.find("video-source").text
                videoText = post.find("video-caption").text

                # Post-process
                videoText = markdownify.markdownify(videoText).rstrip("\n")

                fileContent = videoTemplate % (postSlug, postExportDate, videoCode + "\n\n" + videoText)

            elif postType == "regular":
                postText = post.find("regular-body").text

                try:
                    postTitle = post.find("regular-title").text
                except AttributeError:
                    # Ignore posts without title
                    Console.warn("Ignoring post without title!")
                    continue

                postText = markdownify.markdownify(postText).rstrip("\n")
                fileContent = regularTemplate % (postSlug, postExportDate, postTitle, postText)

            else:
                Console.warn("Unknown POST-TYPE: %s" % postType)
                # dump() writes the element to stdout itself and returns None,
                # so it must not be wrapped in print().
                ElementTree.dump(post)
                continue

            # Write post file
            postPath = os.path.join(postFolder, postDateOnly + "-" + postType + "-" + postSlug + ".markdown")
            with open(postPath, "w") as fileHandle:
                fileHandle.write(fileContent)

        # Update for next requests
        pos = pos + fetch

    Console.outdent()

    Console.info("Successfully imported")
Exemple #34
0
#!/usr/bin/env python

# some comment as an example
from markdownify import markdownify

# Read the whole HTML file, convert it to Markdown and print the result.
with open('example.html') as f:
    lines = f.read()
    md = markdownify(lines)
    # Function-call form works on both Python 2 and Python 3.
    print(md)

class MyClass(object):
    """Placeholder class that currently holds no state and no behaviour."""

    def __init__(self):
        """Create an instance without initialising anything."""
        # TODO: write something!
        return None