Example #1
def TestInput(data):
    fdp = atheris.FuzzedDataProvider(data)

    try:
        ftfy.fix_text(fdp.ConsumeString(1000))
        ftfy.fix_text(fdp.ConsumeUnicode(1000))

        plan1 = ftfy.fix_and_explain(fdp.ConsumeString(1000))[1]
        plan2 = ftfy.fix_and_explain(fdp.ConsumeUnicode(1000))[1]
        ftfy.apply_plan(fdp.ConsumeString(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeString(1000), plan2)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan2)

        ftfy.fix_text_segment(fdp.ConsumeString(1000))
        ftfy.fix_text_segment(fdp.ConsumeUnicode(1000))

        f = open("temp.txt", "w")
        f.write(fdp.ConsumeString(1000))
        f.write(fdp.ConsumeUnicode(1000))
        f.close()
        f = open("temp.txt", "r")
        ftfy.fix_file(f)
        f.close()

        ftfy.guess_bytes(fdp.ConsumeBytes(1000))
    except UnicodeError as e:
        if "Hey wait, this isn't Unicode." not in str(e):
            raise e
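The harness above only defines the fuzz entry point; the surrounding boilerplate is not part of the excerpt. A minimal sketch of how such a harness is typically wired up with atheris (the imports and the Setup/Fuzz calls below are assumptions):

import sys

import atheris
import ftfy

# ... TestInput as defined above ...

if __name__ == "__main__":
    # Instrument imported code for coverage feedback, then hand the
    # entry point to the fuzzing engine.
    atheris.instrument_all()
    atheris.Setup(sys.argv, TestInput)
    atheris.Fuzz()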
Example #2
def fix_file_encoding(in_file, out_file):
    """Fix unicode encoding to ensure proper display."""
    stream = fix_file(
        in_file,
        encoding=None,
        fix_entities=False,
        remove_terminal_escapes=False,
        fix_encoding=True,
        fix_latin_ligatures=False,
        fix_character_width=False,
        uncurl_quotes=False,
        fix_line_breaks=False,
        fix_surrogates=False,
        remove_control_chars=False,
        remove_bom=False,
        normalization="NFC",
    )
    for line in stream:
        out_file.write(line)
    out_file.close()
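A minimal usage sketch for the helper above, assuming the ftfy 5.x keyword-argument API it relies on; the input is opened in binary mode so fix_file can detect the encoding, and the file names are placeholders:

with open("broken.txt", "rb") as fin, \
        open("fixed.txt", "w", encoding="utf-8") as fout:
    fix_file_encoding(fin, fout)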
Example #3
def count_charclasses(fn, fix_unicode=False):
    """Returns character class Counter for header and body of file fn"""
    bcounts, bcountsH = None, None

    with open(fn, 'rb') as fin:
        if fix_unicode:
            guess = guess_encoding(fn)
            fin_fixed = ftfy.fix_file(fin, encoding=guess)
            with iterable_to_stream(fin_fixed) as bffr:
                bcountsH = Counter(bffr.readlines(1)[0])
                bcounts = Counter(bffr.read())
        else:
            bcountsH = Counter(fin.readlines(1)[0])
            bcounts = Counter(fin.read())

    ccountsH = {(get_charclass(chr(k)), chr(k), k): n
                for (k, n) in bcountsH.items()}
    ccounts = {(get_charclass(chr(k)), chr(k), k): n
               for (k, n) in bcounts.items()}

    charclassesH = Counter()
    for k, v in ccountsH.items():
        charclassesH[k[0]] += v
    charclasses = Counter()
    for k, v in ccounts.items():
        charclasses[k[0]] += v
    return (charclassesH, charclasses)
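The iterable_to_stream helper used here is project-specific and not shown in the excerpt. One plausible implementation (an assumption, not the original project's code) wraps the line generator returned by ftfy.fix_file in a buffered byte stream:

import io


def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """Present an iterable of str/bytes chunks as a readable byte stream."""
    iterator = iter(iterable)

    class IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = b""

        def readable(self):
            return True

        def readinto(self, buf):
            try:
                # Assumes the iterable never yields empty chunks, which holds
                # for lines read from a file.
                chunk = self.leftover or next(iterator)
            except StopIteration:
                return 0  # EOF
            if isinstance(chunk, str):
                chunk = chunk.encode("utf-8")
            n = len(buf)
            output, self.leftover = chunk[:n], chunk[n:]
            buf[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)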
Example #4
def get_df_raw(fn, fix_unicode=False):
    """Return a dataframe of character string types.

    Useful if you don't want to let Pandas automatically determine the data
    types, for example, in first steps of input data evaluation.

    WARNING: using fix_unicode=True is very slow!  It might be better to fix
    and copy the file (see fix_unicode_and_copy) for future use, if required.
    In my experience, it is relatively rare to have to do this anyway.
    """
    LOGNAME = '%s:%s' % (os.path.basename(__file__), 'get_df_raw()')
    log = get_logger(LOGNAME)
    guess = guess_encoding(fn)
    with open(fn, 'rb') as fin:
        if fix_unicode:
            log.warn('! Fixing unicode: This may take some time!')
            log.info('creating unicode generator')
            fin_fixed = ftfy.fix_file(fin, encoding=guess)
            log.info('done creating unicode generator')
            with iterable_to_stream(fin_fixed) as bffr:
                t0 = mstime()
                df = pd.read_csv(bffr, encoding='utf8', dtype=str)
                t1 = mstime()
                log.info('created dataframe from fixed unicode of ' +
                         '%s: %d x %d (%d msecs)' %
                         (fn, len(df), len(df.columns), t1 - t0))
                return df
        else:
            t0 = mstime()
            df = pd.read_csv(fin, encoding=guess, dtype=str)
            t1 = mstime()
            log.info('created dataframe ' + '%s: %d x %d (%d msecs)' %
                     (fn, len(df), len(df.columns), t1 - t0))
            return df
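guess_encoding is another helper these excerpts assume but do not define. A plausible stand-in (the use of the chardet package here is an assumption) samples the file's raw bytes and falls back to UTF-8:

import chardet


def guess_encoding(fn, sample_size=64 * 1024):
    """Guess a file's encoding from a sample of its raw bytes."""
    with open(fn, 'rb') as fin:
        result = chardet.detect(fin.read(sample_size))
    return result['encoding'] or 'utf-8'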
Example #5
def fix_unicode_and_copy(fn_i, fn_o):
    """Fix unicode of file fn_i and copy to fn_o."""
    guess = guess_encoding(fn_i)
    if guess != 'UTF-8':
        with open(fn_o, 'w', encoding='utf8') as fout, open(fn_i, 'rb') as fin:
            for line in ftfy.fix_file(fin, encoding=guess):
                fout.write(line)
    else:
        shutil.copyfile(fn_i, fn_o)
def fix_encoding(in_path, in_encoding, out_encoding='utf8'):
    """Attempt to clean up some of the more common encoding screw-ups."""
    in_fh = codecs.open(in_path, 'r+', in_encoding, errors='ignore')
    in_name = in_fh.name
    tmp_name = os.path.join(os.path.dirname(in_fh.name), 'converting.tmp')
    out_fh = codecs.open(tmp_name, 'w+', out_encoding)

    with in_fh, out_fh:
        for line in fix_file(in_fh):
            out_fh.write(line)
    os.rename(tmp_name, in_name)
Example #7
    def open_file(self) -> Iterable:

        # first load policies
        print("##### Loading SHERPA/ROMEO policies...", file=sys.stderr)
        fixed_policy_file = ftfy.fix_file(
            open(self.config.sherpa_romeo_policies_simple.filepath, "rb"))
        policy_reader = csv.DictReader(fixed_policy_file)
        for row in policy_reader:
            self.sherpa_policies[row["RoMEO Record ID"]] = row

        # then open regular file
        raw_file = (open(self.config.sherpa_romeo_journals_simple.filepath,
                         "rb").read().decode(errors="replace"))
        fixed_file = ftfy.fix_text(raw_file)
        return csv.DictReader(fixed_file.split("\n"))
Example #8
def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="file to transcode")

    args = parser.parse_args()

    file = open(args.filename)
    for line in fix_file(file):
        if ENCODE_STDOUT:
            sys.stdout.write(line.encode("utf-8"))
        else:
            sys.stdout.write(line)
Example #10
def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='file to transcode')

    args = parser.parse_args()

    # Why open in Latin-1? Because it at least won't make encoding problems
    # worse, and we're about to make things better.
    file = codecs.open(args.filename, encoding='latin-1')
    for line in fix_file(file):
        if ENCODE_STDOUT:
            sys.stdout.write(line.encode('utf-8'))
        else:
            sys.stdout.write(line)
Example #11
    def load_sherpa_romeo(self, journal_path, policy_path):
        # first load policies
        print("##### Loading SHERPA/ROMEO policies...")
        #RoMEO Record ID,Publisher,Policy Heading,Country,RoMEO colour,Published Permission,Published Restrictions,Published Max embargo,Accepted Prmission,Accepted Restrictions,Accepted Max embargo,Submitted Permission,Submitted Restrictions,Submitted Max embargo,Open Access Publishing,Record Status,Updated
        policies = dict()
        fixed_policy_file = ftfy.fix_file(open(policy_path, 'rb'))
        policy_reader = csv.DictReader(fixed_policy_file)
        for row in policy_reader:
            policies[row['RoMEO Record ID']] = row
        print("##### Loading SHERPA/ROMEO journal metadata...")
        #Journal Title,ISSN,ESSN,URL,RoMEO Record ID,Updated
        # super mangled :(
        raw_file = open(journal_path, 'rb').read().decode(errors='replace')
        fixed_file = ftfy.fix_text(raw_file)
        reader = csv.DictReader(fixed_file.split('\n'))
        counts = Counter()
        for row in reader:
            #row['Journal Title'] = row.pop('\ufeffJournal Title')
            row.update(policies[row['RoMEO Record ID']])
            issnl, status = self.add_issn(
                issnp=row['ISSN'],
                issne=row['ESSN'],
                name=row['Journal Title'],
                publisher=row['Publisher'],
            )
            counts[status] += 1
            if not issnl:
                continue
            d = self.data[issnl]
            sherpa_romeo = dict()
            if row['RoMEO colour']:
                sherpa_romeo['color'] = row['RoMEO colour']
            # row['Open Access Publishing']
            if row['Country']:
                self.add_country(issnl, row['Country'])
            self.data[issnl]['sherpa_romeo'] = sherpa_romeo
        print(counts)
Example #12
def main():
    """
    Run ftfy as a command-line utility.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="ftfy (fixes text for you), version %s" % __version__
    )
    parser.add_argument('filename', default='-', nargs='?',
                        help='The file whose Unicode is to be fixed. Defaults '
                             'to -, meaning standard input.')
    parser.add_argument('-o', '--output', type=str, default='-',
                        help='The file to output to. Defaults to -, meaning '
                             'standard output.')
    parser.add_argument('-g', '--guess', action='store_true',
                        help="Ask ftfy to guess the encoding of your input. "
                             "This is risky. Overrides -e.")
    parser.add_argument('-e', '--encoding', type=str, default='utf-8',
                        help='The encoding of the input. Defaults to UTF-8.')
    parser.add_argument('-n', '--normalization', type=str, default='NFC',
                        help='The normalization of Unicode to apply. '
                             'Defaults to NFC. Can be "none".')
    parser.add_argument('--preserve-entities', action='store_true',
                        help="Leave HTML entities as they are. The default "
                             "is to decode them, as long as no HTML tags "
                             "have appeared in the file.")

    args = parser.parse_args()

    encoding = args.encoding
    if args.guess:
        encoding = None

    if args.filename == '-':
        # Get a standard input stream made of bytes, so we can decode it as
        # whatever encoding is necessary.
        file = sys.stdin.buffer
    else:
        file = open(args.filename, 'rb')

    if args.output == '-':
        outfile = sys.stdout
    else:
        if os.path.realpath(args.output) == os.path.realpath(args.filename):
            sys.stderr.write(SAME_FILE_ERROR_TEXT)
            sys.exit(1)
        outfile = open(args.output, 'w', encoding='utf-8')

    normalization = args.normalization
    if normalization.lower() == 'none':
        normalization = None

    if args.preserve_entities:
        fix_entities = False
    else:
        fix_entities = 'auto'

    try:
        for line in fix_file(file, encoding=encoding,
                             fix_entities=fix_entities,
                             normalization=normalization):
            try:
                outfile.write(line)
            except UnicodeEncodeError:
                if sys.platform == 'win32':
                    sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
                else:
                    sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
                sys.exit(1)
    except UnicodeDecodeError as err:
        sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
        sys.exit(1)
Example #13
def process_story(source, url, date, metadata):
    '''
    Opens the URL and scrapes the relevant text as per the rules in SOURCE_RULES.
    Returns a dict w/ the headline and story content (plus passed-through date/metadata) if it worked; otherwise returns None.
    Also computes a simplistic regex-based word-count as a quality check to compare with GDELT wordcount, plus some other quality warnings.
    '''
    noncritical_warnings = {}
    # initial sanity check on URL embedded date code
    url_date_code = re.search(r'/\d{4}/(\d{2}|\w{3})/(\d{1,2}/)?', url)
    if url_date_code:
        url_date_code = parse_dt(url_date_code.group(0), ignoretz=True)
        gkg_date_code = datetime.datetime.strptime(date, '%Y%m%d%H%M%S')
        diff = gkg_date_code - url_date_code
        if abs(diff.days) > 31:
            print '+' * 20
            print 'WARNING: Date-code embedded in URL differs from date-code provided by GKG by {} days! URL-implied date is {}. Skipping {}.'.format(
                diff.days, url_date_code, url)
            print '+' * 20
            return None

    # wait a bit to avoid getting blocked
    time.sleep(2)
    # open the URL and read the data
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    opener.addheaders = [('User-Agent', 'news scraper for mona project')]
    try:
        # wrap file obj in ftfy.fix_file() to organize unicode
        content = ''.join([x for x in ftfy.fix_file(opener.open(url))])
    except urllib2.HTTPError as e:
        print '+' * 20
        print 'WARNING: HTTP error for "{}": {} - {}. Skipping.'.format(
            url, e.code, e.reason)
        print '+' * 20
        return None
    except urllib2.URLError as e:
        print '+' * 20
        print 'WARNING: URL error for "{}": {}. Skipping.'.format(
            url, e.reason)
        print '+' * 20
        return None
    except Exception as e:
        print '+' * 20
        print 'WARNING: Unexpected exception for "{}": {}. Skipping.'.format(
            url, e.message)
        print '+' * 20
        return None

    # parse the HTML tree
    try:
        tree = html.fromstring(content)
    except Exception:
        print '+' * 20
        print 'WARNING: lxml was unable to parse HTML tree for "{}". Skipping.'.format(
            url)
        print '+' * 20
        return None

    # translate <br> to \n
    for br in tree.xpath('*//br'):
        br.tail = '\n\n' + br.tail if br.tail else '\n\n'

    # apply source-specific preprocessing to the tree
    if SOURCE_RULES[source].get('tree_preprocessor'):
        tree = SOURCE_RULES[source]['tree_preprocessor'](tree)

    # if we didn't figure out the pub date earlier, do it now
    if not url_date_code and not SOURCE_RULES[source].get('timestamp_xpath'):
        print '*' * 20
        print 'No date code in URL and no timestamp xpath rule defined! Skipping {}'.format(
            url)
        print '*' * 20
        return None
    elif not url_date_code and SOURCE_RULES[source].get('timestamp_xpath'):
        article_date_els = tree.xpath(SOURCE_RULES[source]['timestamp_xpath'])
        article_date_string = ' '.join(
            [e.text_content() for e in article_date_els])
        if not article_date_string.strip():
            print '+' * 20
            print 'WARNING: No publication date could be found! Skipping {}.'.format(
                url)
            print '+' * 20
            return None
        try:
            article_date = parse_dt(article_date_string,
                                    fuzzy=True,
                                    ignoretz=True)
        except:
            print '+' * 20
            print 'WARNING: Unable to evaluate article publication date! No sanity check possible. Skipping {}.'.format(
                url)
            print '+' * 20
            return None
        gkg_date_code = datetime.datetime.strptime(date, '%Y%m%d%H%M%S')
        diff = gkg_date_code - article_date
        if abs(diff.days) > 31:
            print '+' * 20
            print 'WARNING: Date-code embedded in article differs from date-code provided by GKG by {} days! Article date is {}. Skipping {}.'.format(
                diff.days, article_date, url)
            print '+' * 20
            return None

    # read headline using xpath
    # if necs, adapt to any known-naughty URLs which require special rules
    if SOURCE_RULES[source].get('naughty_list',
                                {}).get(url, {}).get('headline_xpath', {}):
        headline_xpath = SOURCE_RULES[source]['naughty_list'][url][
            'headline_xpath']
    else:
        headline_xpath = SOURCE_RULES[source]['headline_xpath']
    if headline_xpath:
        headline = '\n\n'.join(
            [e.text_content().lstrip() for e in tree.xpath(headline_xpath)])
    else:
        print 'No headline rule defined for source "{}", skipping "{}"'.format(
            source, url)
        return None
    if DEBUG:
        print '*' * 20
        print url
        print '-' * 20
        print headline

    # read story content using xpath
    # if necs, adapt to any known-naughty URLs which require special rules
    if SOURCE_RULES[source].get('naughty_list',
                                {}).get(url, {}).get('content_xpath', {}):
        content_xpath = SOURCE_RULES[source]['naughty_list'][url][
            'content_xpath']
    else:
        content_xpath = SOURCE_RULES[source]['content_xpath']

    # clean up whitespace by replacing all tabs or spaces (incl the nbsp, \xa0) from the text with a single space
    if content_xpath:
        text_blocks = [
            re.sub(r'[\t\xa0 ]+', ' ', e.text_content())
            for e in tree.xpath(content_xpath)
        ]
    else:
        print 'No content rule defined for source "{}", skipping "{}"'.format(
            source, url)
        return None
    story_content = '\n\n'.join(text_blocks)
    if DEBUG:
        print '-' * 20
        print story_content
        print '*' * 20

    # find repetitive blocks of text and add warning if necs
    rep_count = len([t for t in text_blocks if t.strip()]) - len(
        set([t for t in text_blocks if t.strip()]))
    if rep_count:
        noncritical_warnings['repetitive_text_block_count'] = rep_count
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: {} repetitive text blocks were found! Very suspicious, consider filtering {}.'.format(
                rep_count, url)
            print '^' * 20

    # exclude any data with empty headline or empty story_content
    if not headline or not story_content:
        print '+' * 20
        print 'WARNING: Headline or story from "{}" was blank; excluding from output.'.format(
            url)
        print '+' * 20
        return None

    # copy the metadata
    try:
        metadict = eval(metadata)
        assert type(metadict) is dict
    except AssertionError as e:
        print '+' * 20
        print 'WARNING: Metadata string "{}" does not evaluate to a dict. Skipping "{}".'.format(
            metadata, url)
        print '+' * 20
        return None
    except Exception as e:
        print '+' * 20
        print 'WARNING: Unexpected error evaluating metadata "{}". Skipping "{}".'.format(
            metadata, url)
        print '+' * 20
        return None

    # some additional sanity checking for character encoding
    strangechars = defaultdict(list)
    for i, c in enumerate(headline + story_content):
        if '\\x' in repr(c) or '\\00' in repr(c):
            strangechars[c].append(i)
    if strangechars:
        strangechars = dict(strangechars)
        noncritical_warnings['suspicious_characters'] = strangechars
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: Found potentially-suspicious characters: {} ("{}")'.format(
                strangechars, url)
            print '^' * 20

    # regex wordcount as a sanity check vs. gdelt wordcount (black box)
    regex_wordcount = len(
        re.findall(
            r'\w+',
            ''.join([c for c in story_content
                     if c not in string.punctuation])))
    if not metadict.get('wordcount'):
        noncritical_warnings['wordcount_mismatch'] = {
            'gdelt': None,
            'scraper_regex': regex_wordcount
        }
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: Metadata missing value for "wordcount" so no sanity-check possible. ({})'.format(
                url)
            print '^' * 20
    elif abs(regex_wordcount - metadict['wordcount']) / float(
            metadict['wordcount']) > 0.05:
        noncritical_warnings['wordcount_mismatch'] = {
            'gdelt': metadict['wordcount'],
            'scraper_regex': regex_wordcount
        }
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: GDELT reports {} words but scraped content contains approximately {} ({})'.format(
                metadict['wordcount'], regex_wordcount, url)
            print '^' * 20

    # if we've made it this far w/o returning None, everything's good to go
    return {
        'headline': headline,
        'story_content': story_content,
        'date': date,
        'metadata': metadict,
        'wordcount_as_scraped': regex_wordcount,
        'warnings': noncritical_warnings
    }
Example #14
def main():
    """
    Run ftfy as a command-line utility.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="ftfy (fixes text for you), version %s" % __version__
    )
    parser.add_argument(
        "filename",
        default="-",
        nargs="?",
        help="The file whose Unicode is to be fixed. Defaults "
        "to -, meaning standard input.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="-",
        help="The file to output to. Defaults to -, meaning " "standard output.",
    )
    parser.add_argument(
        "-g",
        "--guess",
        action="store_true",
        help="Ask ftfy to guess the encoding of your input. "
        "This is risky. Overrides -e.",
    )
    parser.add_argument(
        "-e",
        "--encoding",
        type=str,
        default="utf-8",
        help="The encoding of the input. Defaults to UTF-8.",
    )
    parser.add_argument(
        "-n",
        "--normalization",
        type=str,
        default="NFC",
        help="The normalization of Unicode to apply. "
        'Defaults to NFC. Can be "none".',
    )
    parser.add_argument(
        "--preserve-entities",
        action="store_true",
        help="Leave HTML entities as they are. The default "
        "is to decode them, as long as no HTML tags "
        "have appeared in the file.",
    )

    args = parser.parse_args()

    encoding = args.encoding
    if args.guess:
        encoding = None

    if args.filename == "-":
        # Get a standard input stream made of bytes, so we can decode it as
        # whatever encoding is necessary.
        file = sys.stdin.buffer
    else:
        file = open(args.filename, "rb")

    if args.output == "-":
        outfile = sys.stdout
    else:
        if os.path.realpath(args.output) == os.path.realpath(args.filename):
            sys.stderr.write(SAME_FILE_ERROR_TEXT)
            sys.exit(1)
        outfile = open(args.output, "w", encoding="utf-8")

    normalization = args.normalization
    if normalization.lower() == "none":
        normalization = None

    if args.preserve_entities:
        unescape_html = False
    else:
        unescape_html = "auto"

    config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization)

    try:
        for line in fix_file(file, encoding=encoding, config=config):
            try:
                outfile.write(line)
            except UnicodeEncodeError:
                if sys.platform == "win32":
                    sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
                else:
                    sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
                sys.exit(1)
    except UnicodeDecodeError as err:
        sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
        sys.exit(1)
Example #15
"""
This script uses ftfy to fix broken Unicode in a Latin-1 text file, strips punctuation, and removes numbers.
"""
import ftfy
import sys
import codecs
for line in ftfy.fix_file(
        codecs.open('new-latin1.txt', errors='ignore', encoding='latin1')):
    if line.strip().startswith('['):
        continue
    line = line.replace(",", "").replace("?", "").replace(".", "").replace(
        "!", "").replace(";", "").replace("-", "").replace("'", " ").lower()
    line = ''.join([i for i in line if not i.isdigit()])  #remove numbers
    line = line.strip()
    if line == '':
        continue
    print(line.replace("\n", ""))
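The chain of replace calls above can be written more compactly with str.translate; this sketch is intended to be behavior-equivalent (punctuation and digits dropped, apostrophes turned into spaces):

import codecs

import ftfy

# Map apostrophes to spaces; delete the listed punctuation and digits.
CLEANUP = str.maketrans("'", " ", ",?.!;-0123456789")

for line in ftfy.fix_file(
        codecs.open('new-latin1.txt', errors='ignore', encoding='latin1')):
    if line.strip().startswith('['):
        continue
    line = line.translate(CLEANUP).lower().strip()
    if line:
        print(line.replace("\n", ""))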