Example #1
def TestInput(data):
    fdp = atheris.FuzzedDataProvider(data)

    try:
        ftfy.fix_text(fdp.ConsumeString(1000))
        ftfy.fix_text(fdp.ConsumeUnicode(1000))

        plan1 = ftfy.fix_and_explain(fdp.ConsumeString(1000))[1]
        plan2 = ftfy.fix_and_explain(fdp.ConsumeUnicode(1000))[1]
        ftfy.apply_plan(fdp.ConsumeString(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeString(1000), plan2)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan2)

        ftfy.fix_text_segment(fdp.ConsumeString(1000))
        ftfy.fix_text_segment(fdp.ConsumeUnicode(1000))

        f = open("temp.txt", "w")
        f.write(fdp.ConsumeString(1000))
        f.write(fdp.ConsumeUnicode(1000))
        f.close()
        f = open("temp.txt", "r")
        ftfy.fix_file(f)
        f.close()

        ftfy.guess_bytes(fdp.ConsumeBytes(1000))
    except UnicodeError as e:
        # ftfy.guess_bytes deliberately refuses some inputs with this message;
        # any other UnicodeError is a real finding and should be re-raised.
        if "Hey wait, this isn't Unicode." not in str(e):
            raise
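The harness above only defines the fuzz entry point; it still has to be registered with Atheris before it can run. A minimal driver sketch, assuming TestInput lives in the same file and that the atheris and ftfy packages are installed:

import sys

import atheris

with atheris.instrument_imports():
    import ftfy

if __name__ == "__main__":
    # Hand the entry point defined above to the fuzzing engine.
    atheris.Setup(sys.argv, TestInput)
    atheris.Fuzz()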
Example #2
def clean(datapath):
    """
    Fixes encoding errors in a data file and
    gets rid of data that still seems problematic.
    """
    # Tell-tale mojibake sequences that indicate the text is still broken
    red_flags = ['â€', 'Â']

    with open(datapath, 'r') as file:
        data = json.load(file)

    bad = []
    good = []
    for article in progress(data, 'Fixing {0} articles...'.format(len(data))):
        for key in ['title', 'text']:
            article[key] = fix_text_segment(article[key])

        flagged = False
        for flag in red_flags:
            if flag in article['text'] + article['title']:
                bad.append(article)
                flagged = True
                break
        if not flagged:
            good.append(article)

    print('Getting rid of {0} bad articles.'.format(len(bad)))

    outpath = datapath.replace('.json', '_cleaned.json')
    with open(outpath, 'w') as file:
        json.dump(good, file)
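The output path is derived from the input path, so the cleaned data lands next to the original file. A usage sketch with a hypothetical file name:

# Fixes encoding in 'title' and 'text', drops articles that still look
# garbled, and writes the survivors to articles_cleaned.json.
clean('articles.json')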
Example #3
 def format(self, data):
     super().format(data)
     rn = self._xml.find('.//items/.')
     if len(data):
         for item in data:
             items_rt = ET.XML('<item></item>')
             for el in item:
                 items_rt.append(el)
             rn.append(items_rt)
     for el in rn.iter():
         if el.text:
             el.text = ftfy.fix_text_segment(el.text, fix_entities='auto', remove_terminal_escapes=True, fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True, uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True, remove_control_chars=True, remove_bom=True, normalization='NFKC')
     # remove non-breaking space characters from text nodes
     for el in self._xml.iter():
         if el.text:
             el.text = el.text.replace('\xa0', '')
         if el.tail:
             el.tail = el.tail.replace('\xa0', '')
     try:
         p = self._xml.findall('.//headers/..')
         for e in p:
             e.remove(e.find('./headers'))
     except AttributeError:
         pass
     try:
         p = self._xml.findall('.//paging/..')
         for e in p:
             e.remove(e.find('./paging'))
     except AttributeError:
         pass
     if self.__class__.__name__ == 'SaveFaceFormatterXML':
         if self._formatter_func is not None:
             self._xml = self._formatter_func(self._xml)
Example #4
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    eq_(fix_text(example), '&\n<html>\n&amp;')
    eq_(fix_text_segment(example), '&amp;\n<html>\n&amp;')

    eq_(fix_text(example, fix_entities=True), '&\n<html>\n&')
    eq_(fix_text_segment(example, fix_entities=True), '&\n<html>\n&')

    eq_(fix_text(example, fix_entities=False), '&amp;\n<html>\n&amp;')
    eq_(fix_text_segment(example, fix_entities=False), '&amp;\n<html>\n&amp;')

    eq_(fix_text_segment('&lt;&gt;', fix_entities=False), '&lt;&gt;')
    eq_(fix_text_segment('&lt;&gt;', fix_entities=True), '<>')
    eq_(fix_text_segment('&lt;&gt;'), '<>')
    eq_(fix_text_segment('jednocze&sacute;nie'), 'jednocześnie')
    eq_(fix_text_segment('JEDNOCZE&Sacute;NIE'), 'JEDNOCZEŚNIE')
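The eq_ assertions are nose-style; to run this test on its own, the names would presumably be imported along these lines (an assumption about the surrounding test module, which is not shown in the snippet):

from nose.tools import eq_
from ftfy import fix_text, fix_text_segment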
Example #5
def load_metadata(soup):
    if not soup:
        return {}
    metas = soup.select("script[type='application/ld+json']")
    o = {"errors": []}
    for meta in metas:
        try:
            _o = json.loads(meta.string)

        except Exception as e:
            o["errors"].append([f"{e.__class__.__name__} :: {e}", meta.string])
            continue

        if isinstance(_o, list):
            for __o in _o:
                o[__o["@type"]] = __o
        elif "@type" in _o and isinstance(_o["@type"], list):
            for t in _o["@type"]:
                for __o in _o:
                    o[t] = __o
        elif "@type" in _o and _o["@type"] in o:
            o[_o["@type"]] = _o
        else:
            continue

    flat = FlatterDict(o)
    o = {}
    for k, v in flat.items():

        if (isinstance(v, str) and not re.search(r"(image|url)", k)
                and not re.match(r"\s*http", v)):
            txt = fix_text_segment(v)
            if txt and "<" in txt:
                txt = re.sub(r"<[^>]*>", "", txt)
            o[k] = txt
        else:
            o[k] = v
    flat = json.loads(
        json.dumps(
            {k: v
             for k, v in o.items()},
            indent=4,
            default=lambda x: dict(**x) if isinstance(x, FlatterDict) else x,
        ))
    print(flat)
    return flat
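FlatterDict is not defined in the snippet; it presumably comes from the flatdict package, which exposes nested dictionaries through single-level keys. A minimal sketch of that assumed dependency:

from flatdict import FlatterDict

# Nested metadata is flattened to single-level keys, which is what the
# key-based filtering above (re.search on "image|url") operates on.
flat = FlatterDict({"Article": {"image": {"url": "https://example.com/a.jpg"}}})
for k, v in flat.items():
    print(k, v)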
Example #6
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    eq_(fix_text(example), '&\n<html>\n&amp;')
    eq_(fix_text_segment(example), '&amp;\n<html>\n&amp;')

    eq_(fix_text(example, fix_entities=True), '&\n<html>\n&')
    eq_(fix_text_segment(example, fix_entities=True), '&\n<html>\n&')

    eq_(fix_text(example, fix_entities=False), '&amp;\n<html>\n&amp;')
    eq_(fix_text_segment(example, fix_entities=False), '&amp;\n<html>\n&amp;')

    eq_(fix_text_segment('&lt;&gt;', fix_entities=False), '&lt;&gt;')
    eq_(fix_text_segment('&lt;&gt;', fix_entities=True), '<>')
    eq_(fix_text_segment('&lt;&gt;'), '<>')
Example #7
    def __init__(self,
                 url,
                 html,
                 row,
                 soup=None,
                 lxml=None,
                 fix_encoding_errors=True):
        self.url = url
        self.sitemap_data = row
        self.html = (fix_text_segment(html.replace("\xa0", " "),
                                      uncurl_quotes=False)
                     if fix_encoding_errors else html)
        try:

            self.soup = soup if soup else BeautifulSoup(self.html)
        except Exception as e:
            raise ValueError(f"{e.__class__.__name__} :: {e}, {self.html}")
        self.meta = Haystack(html)
        #print(json.dumps(self.meta, indent=4))
        try:
            if isinstance(self.html, str):
                self.html = self.html.encode("utf-8")
            self.lxml = lxml if lxml else parse_html(self.html)
        except Exception as e:
            raise ValueError(f"{e.__class__.__name__} :: {e}, {self.html}")
        self.data = {
            "content": self.content,
            "url": self.url,
            "title": self.title,
            "published_at": self.published_at,
            "description": self.summary,
            "author": self.author,
            "image_url": self.image_url,
            "section": self.section,
            "publisher": self.publisher,
            "keywords": self.keywords,
            "metadata": {k: v
                         for k, v in self.meta.data.items()},
        }
        self.data.update(
            {k: row[k]
             for k in self.passthrough_attrs if row and k in row})
Example #8
def fix(text, lang, chars_rep, chars_pattern, punct_rep, punct_pattern):
    global global_chars_lang
    global_chars_lang = chars_rep

    # htmlEntity=regex.compile(r'[&][[:space:]]*[#][[:space:]]*[0-9]{2,4}[[:space:]]*[;]?',regex.U)
    chars3Re = regex.compile("[\uE000-\uFFFF]")
    chars3Re2 = regex.compile("[\u2000-\u200F]")
    chars3Re3 = regex.compile("\u007F|[\u0080-\u00A0]")
    quotesRegex = regex.compile(r"(?P<start>[[:alpha:]])''(?P<end>(s|S|t|T|m|M|d|D|re|RE|ll|LL|ve|VE|em|EM)\W)")
    collapse_spaced_entities = regex.compile('([&][ ]*[#][ ]*)([0-9]{2,6})([ ]*[;])')

    # Test encode: fix mojibake
    ftfy_fixed_text = " ".join([ftfy.fix_text_segment(word, uncurl_quotes=False, fix_latin_ligatures=False) for word in text.split()])
    # ftfy_fixed_text= ftfy.fix_text_segment(stripped_text, fix_entities=True,uncurl_quotes=False,fix_latin_ligatures=False)

    # nicely_encoded_text = htmlEntity.sub(html.unescape, nicely_encoded_text)
    nicely_encoded_text = html.unescape(ftfy_fixed_text)

    # First replacing all HTML entities
    # for substring in htmlEntity.findall(nicely_encoded_text):
    #    code=substring.replace(' ','')[2:].replace(';','')
    #    try:
    #        newChar=chr(int(code))
    #    except ValueError:
    #        newChar=code    
    #    if newChar != "\n":
    #        nicely_encoded_text = nicely_encoded_text.replace(substring,newChar)

    normalized_text = chars_pattern.sub(replace_chars, nicely_encoded_text)

    if lang.lower() != "ja":
        normalized_text = chars3Re.sub(replace_chars3, normalized_text)
    normalized_text = chars3Re2.sub(replace_chars3, normalized_text)
    normalized_text = chars3Re3.sub(replace_chars3, normalized_text)
    normalized_text = quotesRegex.sub(r"\g<start>'\g<end>", normalized_text)
    normalized_text_with_normalized_punct = punct_pattern.sub(lambda m: punct_rep[re.escape(m.group(0))], normalized_text)

    collapsed_spaces = re.sub(r'\s+', ' ', normalized_text_with_normalized_punct)  # Collapse multiple spaces
    collapsed_entities = collapse_spaced_entities.sub("&#\\2;", collapsed_spaces)

    return collapsed_entities.strip(" \n")
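replace_chars, replace_chars3, and the punct_rep table are defined elsewhere in the original module and are not shown here. The substitutions above rely on regex callbacks that receive a match object; a hypothetical stand-in, only to illustrate that pattern, might look like this:

# Hypothetical stand-ins; the real project defines these elsewhere,
# backed by per-language replacement tables.
def replace_chars(match):
    # Look up the matched character in the language-specific table,
    # falling back to the character itself.
    return global_chars_lang.get(match.group(0), match.group(0))

def replace_chars3(match):
    # Characters from the private-use, zero-width and C1 control ranges
    # are simply dropped.
    return ''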
Example #9
def get_articles(feed, fn):
    """
    Parse the specified feed,
    gathering the latest new articles.

    If an article matches one that already exists,
    it is skipped.

    The minimum length of an entry is
    400 characters. Anything under that will be ignored.

    This will silently skip articles for which the full text
    can't be retrieved (i.e. if it returns 404).

    Some feeds, for whatever reason, do not include a `published`
    date in their entry data, in which case it is left as an
    empty string.

    Args:
        | feed (Feed)    -- the feed to fetch from.
        | fn (Callable)  -- function to call with each new Article.
    """
    # Fetch the feed data.
    data = feedparser.parse(feed.ext_url)

    # If the `bozo` value is anything
    # but 0, there was an error parsing (or connecting) to the feed.
    if data.bozo:
        # Some errors are ok.
        if not isinstance(
                data.bozo_exception,
                feedparser.CharacterEncodingOverride) and not isinstance(
                    data.bozo_exception, feedparser.NonXMLContentType):
            raise data.bozo_exception

    for entry in data.entries:

        # URL for this entry.
        url = entry['links'][0]['href']

        # Check for an existing Article.
        # If one exists, skip.
        if Article.query.filter_by(ext_url=url).count():
            continue

        # Complete HTML content for this entry.
        try:
            entry_data, html = extractor.extract_entry_data(url)
        except (error.HTTPError, error.URLError, ConnectionResetError,
                BadStatusLine) as e:
            if type(e) == error.URLError or e.code == 404:
                # Can't reach, skip.
                logger.exception(
                    'Error extracting data for url {0}'.format(url))
                continue
            else:
                # Just skip so things don't break!
                logger.exception(
                    'Error extracting data for url {0}'.format(url))
                continue

        if entry_data is None:
            continue

        full_text = entry_data.cleaned_text

        # Skip over entries that are too short.
        if len(full_text) < 400:
            continue

        url = entry_data.canonical_link or url
        published = parse(entry.get('published')) if entry.get(
            'published') else entry_data.publish_date
        updated = parse(
            entry.get('updated')) if entry.get('updated') else published
        title = entry.get('title', entry_data.title)

        # Secondary check for an existing Article,
        # by checking the title and source.
        existing = Article.query.filter_by(title=title).first()
        if existing and existing.source == feed.source:
            continue

        # Download and save the top article image.
        image_url = extractor.extract_image(entry_data, filename=hash(url))

        fn(
            Article(ext_url=url,
                    source=feed.source,
                    feed=feed,
                    html=html,
                    text=fix_text_segment(full_text),
                    authors=extractor.extract_authors(entry),
                    tags=extractor.extract_tags(entry,
                                                known_tags=entry_data.tags),
                    title=fix_text_segment(title),
                    created_at=published,
                    updated_at=updated,
                    image=image_url,
                    score=evaluator.score(url)))
Example #10
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    eq_(fix_text(example), '&\n<html>\n&amp;')
    eq_(fix_text_segment(example), '&amp;\n<html>\n&amp;')

    eq_(fix_text(example, fix_entities=True), '&\n<html>\n&')
    eq_(fix_text_segment(example, fix_entities=True), '&\n<html>\n&')

    eq_(fix_text(example, fix_entities=False), '&amp;\n<html>\n&amp;')
    eq_(fix_text_segment(example, fix_entities=False), '&amp;\n<html>\n&amp;')

    eq_(fix_text_segment('&lt;&gt;', fix_entities=False), '&lt;&gt;')
    eq_(fix_text_segment('&lt;&gt;', fix_entities=True), '<>')
    eq_(fix_text_segment('&lt;&gt;'), '<>')
    eq_(fix_text_segment('jednocze&sacute;nie'), 'jednocześnie')
    eq_(fix_text_segment('JEDNOCZE&Sacute;NIE'), 'JEDNOCZEŚNIE')
    eq_(fix_text_segment('ellipsis&#133;', normalization='NFKC'), 'ellipsis...')
    eq_(fix_text_segment('ellipsis&#x85;', normalization='NFKC'), 'ellipsis...')
    eq_(fix_text_segment('broken&#x81;'), 'broken\x81')
    eq_(unescape_html('euro &#x80;'), 'euro €')
    eq_(unescape_html('not an entity &#20x6;'), 'not an entity &#20x6;')
Example #11
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    assert fix_text(example) == '&\n<html>\n&amp;'
    assert fix_text_segment(example) == '&amp;\n<html>\n&amp;'

    assert fix_text(example, fix_entities=True) == '&\n<html>\n&'
    assert fix_text_segment(example, fix_entities=True) == '&\n<html>\n&'

    assert fix_text(example, fix_entities=False) == '&amp;\n<html>\n&amp;'
    assert fix_text_segment(example,
                            fix_entities=False) == '&amp;\n<html>\n&amp;'

    assert fix_text_segment('&lt;&gt;', fix_entities=False) == '&lt;&gt;'
    assert fix_text_segment('&lt;&gt;', fix_entities=True) == '<>'
    assert fix_text_segment('&lt;&gt;') == '<>'
    assert fix_text_segment('jednocze&sacute;nie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZE&Sacute;NIE') == 'JEDNOCZEŚNIE'
    assert fix_text_segment('ellipsis&#133;',
                            normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis&#x85;',
                            normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken&#x81;') == 'broken\x81'
    assert fix_text_segment('&amp;amp;amp;') == '&'
    assert unescape_html('euro &#x80;') == 'euro €'
    assert unescape_html('EURO &EURO;') == 'EURO €'
    assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
    assert unescape_html('JEDNOCZE&SACUTE;NIE') == 'JEDNOCZEŚNIE'
    assert unescape_html('V&SCARON;ICHNI') == 'VŠICHNI'
    assert unescape_html('&#xffff;') == ''
    assert unescape_html('&#xffffffff;') == '\ufffd'
    assert (fix_text_segment('this is just informal english &not html') ==
            'this is just informal english &not html')
Example #12
    def run(self, dumppath, use_patch):
        if use_patch:
            print('Patching out saving images to S3...')
            patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
            patcher.start()
        else:
            patcher = None

        print('Loading sources...')
        sources_map = {}
        with open(os.path.join(dumppath, 'sources.json'), 'r') as f:
            sources = json.load(f)
            for i, s in enumerate(sources):
                source = Source.query.filter(Source.name == s['name']).first()
                if not source:
                    source = Source(name=s['name'])
                    db.session.add(source)
                id = s['_id']['$oid']
                sources_map[id] = source

                progress_bar(i/(len(sources) - 1) * 100)

        db.session.commit()

        print('\nLoading feeds...')
        feeds_map = {}
        with open(os.path.join(dumppath, 'feeds.json'), 'r') as f:
            feeds = json.load(f)
            for i, f in enumerate(feeds):
                feed = Feed.query.filter(Feed.ext_url == f['ext_url']).first()
                if not feed:
                    feed = Feed(ext_url=f['ext_url'])
                    db.session.add(feed)
                feed.source = sources_map[f['source']['$oid']]

                id = f['_id']['$oid']
                feeds_map[id] = feed

                progress_bar(i/(len(feeds) - 1) * 100)

        db.session.commit()

        print('\nLoading articles...')
        with open(os.path.join(dumppath, 'articles.json'), 'r') as f:
            articles = json.load(f)
            for i, a in enumerate(articles):

                authors = []
                for author in a['authors']:
                    authors.append(Author.find_or_create(name=author))

                existing = Article.query.filter(Article.ext_url == a['ext_url']).first()

                if not existing:
                    feed = feeds_map[a['feed']['$oid']]
                    article = Article(
                        ext_url=a['ext_url'],
                        source=feed.source,
                        feed=feed,
                        html=None,    # not saved by argos.corpora
                        text=fix_text_segment(a['text']),
                        authors=authors,
                        tags=[],
                        title=fix_text_segment(a['title']),
                        created_at=datetime.fromtimestamp(a['created_at']['$date']/1000),
                        updated_at=datetime.fromtimestamp(a['updated_at']['$date']/1000),
                        image=a['image'],
                        score=evaluator.score(a['ext_url'])
                    )
                    db.session.add(article)
                progress_bar(i/(len(articles) - 1) * 100)

        print('Loaded {0} sources, {1} feeds, and {2} articles.'.format(len(sources), len(feeds), len(articles)))
        print('Done!')

        if patcher is not None:
            patcher.stop()