import atheris
import ftfy


def TestInput(data):
    fdp = atheris.FuzzedDataProvider(data)
    try:
        ftfy.fix_text(fdp.ConsumeString(1000))
        ftfy.fix_text(fdp.ConsumeUnicode(1000))
        plan1 = ftfy.fix_and_explain(fdp.ConsumeString(1000))[1]
        plan2 = ftfy.fix_and_explain(fdp.ConsumeUnicode(1000))[1]
        ftfy.apply_plan(fdp.ConsumeString(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeString(1000), plan2)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan2)
        ftfy.fix_text_segment(fdp.ConsumeString(1000))
        ftfy.fix_text_segment(fdp.ConsumeUnicode(1000))
        # Round-trip some fuzzed text through a file so fix_file gets exercised.
        f = open("temp.txt", "w")
        f.write(fdp.ConsumeString(1000))
        f.write(fdp.ConsumeUnicode(1000))
        f.close()
        f = open("temp.txt", "r")
        ftfy.fix_file(f)
        f.close()
        ftfy.guess_bytes(fdp.ConsumeBytes(1000))
    except UnicodeError as e:
        # Only this specific, documented ftfy error is expected; anything else
        # is a real finding and should crash the fuzzer.
        if "Hey wait, this isn't Unicode." not in str(e):
            raise e
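A minimal driver for the harness above, as a sketch: it assumes the standard atheris entry points (atheris.Setup and atheris.Fuzz) and the atheris/ftfy imports shown above.

import sys

if __name__ == "__main__":
    # atheris repeatedly calls TestInput with mutated byte strings.
    atheris.Setup(sys.argv, TestInput)
    atheris.Fuzz()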
def fix_file_encoding(in_file, out_file):
    """Fix unicode encoding to ensure proper display."""
    stream = fix_file(
        in_file,
        encoding=None,
        fix_entities=False,
        remove_terminal_escapes=False,
        fix_encoding=True,
        fix_latin_ligatures=False,
        fix_character_width=False,
        uncurl_quotes=False,
        fix_line_breaks=False,
        fix_surrogates=False,
        remove_control_chars=False,
        remove_bom=False,
        normalization="NFC",
    )
    # fix_file yields the fixed text lazily; copy it straight to the output.
    for line in stream:
        out_file.write(line)
    out_file.close()
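A short usage sketch for fix_file_encoding with hypothetical file names; it assumes the ftfy 5.x fix_file signature used above. Note that fix_file_encoding closes out_file itself, so the with block's close is a harmless no-op.

# Hypothetical paths for illustration.
with open("broken.txt", encoding="latin-1") as in_file, \
        open("fixed.txt", "w", encoding="utf-8") as out_file:
    fix_file_encoding(in_file, out_file)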
def count_charclasses(fn, fix_unicode=False):
    """Returns character class Counters for header and body of file fn"""
    bcounts, bcountsH = None, None
    with open(fn, 'rb') as fin:
        if fix_unicode:
            guess = guess_encoding(fn)
            fin_fixed = ftfy.fix_file(fin, encoding=guess)
            with iterable_to_stream(fin_fixed) as bffr:
                bcountsH = Counter(bffr.readlines(1)[0])
                bcounts = Counter(bffr.read())
        else:
            bcountsH = Counter(fin.readlines(1)[0])
            bcounts = Counter(fin.read())
    ccountsH = {(get_charclass(chr(k)), chr(k), k): n
                for (k, n) in bcountsH.items()}
    ccounts = {(get_charclass(chr(k)), chr(k), k): n
               for (k, n) in bcounts.items()}
    charclassesH = Counter()
    for k, v in ccountsH.items():
        charclassesH[k[0]] += v
    charclasses = Counter()
    for k, v in ccounts.items():
        charclasses[k[0]] += v
    return (charclassesH, charclasses)
def get_df_raw(fn, fix_unicode=False):
    """Return a dataframe of character string types.

    Useful if you don't want to let Pandas automatically determine the data
    types, for example, in the first steps of input data evaluation.

    WARNING: using fix_unicode=True is very slow! It might be better to fix
    and copy the file for future use (see fix_unicode_and_copy), if required.
    In my experience, it is relatively rare to have to do this anyway.
    """
    LOGNAME = '%s:%s' % (os.path.basename(__file__), 'get_df_raw()')
    log = get_logger(LOGNAME)
    guess = guess_encoding(fn)
    with open(fn, 'rb') as fin:
        if fix_unicode:
            log.warn('! Fixing unicode: This may take some time!')
            log.info('creating unicode generator')
            fin_fixed = ftfy.fix_file(fin, encoding=guess)
            log.info('done creating unicode generator')
            with iterable_to_stream(fin_fixed) as bffr:
                t0 = mstime()
                df = pd.read_csv(bffr, encoding='utf8', dtype=str)
                t1 = mstime()
                log.info('created dataframe from fixed unicode of '
                         '%s: %d x %d (%d msecs)'
                         % (fn, len(df), len(df.columns), t1 - t0))
                return df
        else:
            t0 = mstime()
            df = pd.read_csv(fin, encoding=guess, dtype=str)
            t1 = mstime()
            log.info('created dataframe '
                     '%s: %d x %d (%d msecs)'
                     % (fn, len(df), len(df.columns), t1 - t0))
            return df
def fix_unicode_and_copy(fn_i, fn_o):
    """Fix unicode of file fn_i and copy to fn_o."""
    guess = guess_encoding(fn_i)
    if guess != 'UTF-8':
        with open(fn_o, 'w', encoding='utf8') as fout, open(fn_i, 'rb') as fin:
            for line in ftfy.fix_file(fin, encoding=guess):
                fout.write(line)
    else:
        shutil.copyfile(fn_i, fn_o)
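Usage is a single call; the file names here are hypothetical.

# Re-encodes only when guess_encoding() reports something other than UTF-8;
# otherwise the file is copied unchanged.
fix_unicode_and_copy("export_latin1.csv", "export_utf8.csv")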
def fix_encoding(in_path, in_encoding, out_encoding='utf8'):
    """Attempt to clean up some of the more common encoding screw-ups."""
    in_fh = codecs.open(in_path, 'r+', in_encoding, errors='ignore')
    in_name = in_fh.name
    tmp_name = os.path.join(os.path.dirname(in_fh.name), 'converting.tmp')
    out_fh = codecs.open(tmp_name, 'w+', out_encoding)
    with in_fh, out_fh:
        for line in fix_file(in_fh):
            out_fh.write(line)
    os.rename(tmp_name, in_name)
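A usage sketch with a hypothetical path and source encoding; the cleaned text is written to the temporary converting.tmp file and then renamed over the original.

# Hypothetical path; the file is rewritten in place as UTF-8.
fix_encoding("data/export.csv", "cp1252")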
def open_file(self) -> Iterable:
    # first load policies
    print("##### Loading SHERPA/ROMEO policies...", file=sys.stderr)
    fixed_policy_file = ftfy.fix_file(
        open(self.config.sherpa_romeo_policies_simple.filepath, "rb"))
    policy_reader = csv.DictReader(fixed_policy_file)
    for row in policy_reader:
        self.sherpa_policies[row["RoMEO Record ID"]] = row

    # then open regular file
    raw_file = (open(self.config.sherpa_romeo_journals_simple.filepath, "rb")
                .read().decode(errors="replace"))
    fixed_file = ftfy.fix_text(raw_file)
    return csv.DictReader(fixed_file.split("\n"))
def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="file to transcode")
    args = parser.parse_args()
    file = open(args.filename)
    for line in fix_file(file):
        if ENCODE_STDOUT:
            sys.stdout.write(line.encode("utf-8"))
        else:
            sys.stdout.write(line)
def main():
    """
    Run ftfy as a command-line utility. (Requires Python 2.7 or later, or
    the 'argparse' module.)
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='file to transcode')
    args = parser.parse_args()
    # Why open in Latin-1? Because it at least won't make encoding problems
    # worse, and we're about to make things better.
    file = codecs.open(args.filename, encoding='latin-1')
    for line in fix_file(file):
        if ENCODE_STDOUT:
            sys.stdout.write(line.encode('utf-8'))
        else:
            sys.stdout.write(line)
def load_sherpa_romeo(self, journal_path, policy_path):
    # first load policies
    print("##### Loading SHERPA/ROMEO policies...")
    # RoMEO Record ID,Publisher,Policy Heading,Country,RoMEO colour,
    # Published Permission,Published Restrictions,Published Max embargo,
    # Accepted Prmission,Accepted Restrictions,Accepted Max embargo,
    # Submitted Permission,Submitted Restrictions,Submitted Max embargo,
    # Open Access Publishing,Record Status,Updated
    policies = dict()
    fixed_policy_file = ftfy.fix_file(open(policy_path, 'rb'))
    policy_reader = csv.DictReader(fixed_policy_file)
    for row in policy_reader:
        policies[row['RoMEO Record ID']] = row

    print("##### Loading SHERPA/ROMEO journal metadata...")
    # Journal Title,ISSN,ESSN,URL,RoMEO Record ID,Updated
    # super mangled :(
    raw_file = open(journal_path, 'rb').read().decode(errors='replace')
    fixed_file = ftfy.fix_text(raw_file)
    reader = csv.DictReader(fixed_file.split('\n'))
    counts = Counter()
    for row in reader:
        #row['Journal Title'] = row.pop('\ufeffJournal Title')
        row.update(policies[row['RoMEO Record ID']])
        issnl, status = self.add_issn(
            issnp=row['ISSN'],
            issne=row['ESSN'],
            name=row['Journal Title'],
            publisher=row['Publisher'],
        )
        counts[status] += 1
        if not issnl:
            continue
        d = self.data[issnl]
        sherpa_romeo = dict()
        if row['RoMEO colour']:
            sherpa_romeo['color'] = row['RoMEO colour']
        # row['Open Access Publishing']
        if row['Country']:
            self.add_country(issnl, row['Country'])
        self.data[issnl]['sherpa_romeo'] = sherpa_romeo
    print(counts)
def main():
    """
    Run ftfy as a command-line utility.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="ftfy (fixes text for you), version %s" % __version__
    )
    parser.add_argument('filename', default='-', nargs='?',
                        help='The file whose Unicode is to be fixed. Defaults '
                             'to -, meaning standard input.')
    parser.add_argument('-o', '--output', type=str, default='-',
                        help='The file to output to. Defaults to -, meaning '
                             'standard output.')
    parser.add_argument('-g', '--guess', action='store_true',
                        help="Ask ftfy to guess the encoding of your input. "
                             "This is risky. Overrides -e.")
    parser.add_argument('-e', '--encoding', type=str, default='utf-8',
                        help='The encoding of the input. Defaults to UTF-8.')
    parser.add_argument('-n', '--normalization', type=str, default='NFC',
                        help='The normalization of Unicode to apply. '
                             'Defaults to NFC. Can be "none".')
    parser.add_argument('--preserve-entities', action='store_true',
                        help="Leave HTML entities as they are. The default "
                             "is to decode them, as long as no HTML tags "
                             "have appeared in the file.")
    args = parser.parse_args()

    encoding = args.encoding
    if args.guess:
        encoding = None

    if args.filename == '-':
        # Get a standard input stream made of bytes, so we can decode it as
        # whatever encoding is necessary.
        file = sys.stdin.buffer
    else:
        file = open(args.filename, 'rb')

    if args.output == '-':
        outfile = sys.stdout
    else:
        if os.path.realpath(args.output) == os.path.realpath(args.filename):
            sys.stderr.write(SAME_FILE_ERROR_TEXT)
            sys.exit(1)
        outfile = open(args.output, 'w', encoding='utf-8')

    normalization = args.normalization
    if normalization.lower() == 'none':
        normalization = None

    if args.preserve_entities:
        fix_entities = False
    else:
        fix_entities = 'auto'

    try:
        for line in fix_file(file, encoding=encoding,
                             fix_entities=fix_entities,
                             normalization=normalization):
            try:
                outfile.write(line)
            except UnicodeEncodeError:
                if sys.platform == 'win32':
                    sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
                else:
                    sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
                sys.exit(1)
    except UnicodeDecodeError as err:
        sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
        sys.exit(1)
def process_story(source, url, date, metadata):
    '''
    Opens the URL and scrapes the relevant text as per the rules in
    SOURCE_RULES. Returns a dict w/ the headline and story content (plus
    passed-through date/metadata) if it worked; otherwise returns None.
    Also computes a simplistic regex-based word-count as a quality check to
    compare with GDELT wordcount, plus some other quality warnings.
    '''
    noncritical_warnings = {}

    # initial sanity check on URL embedded date code
    url_date_code = re.search(r'/\d{4}/(\d{2}|\w{3})/(\d{1,2}/)?', url)
    if url_date_code:
        url_date_code = parse_dt(url_date_code.group(0), ignoretz=True)
        gkg_date_code = datetime.datetime.strptime(date, '%Y%m%d%H%M%S')
        diff = gkg_date_code - url_date_code
        if abs(diff.days) > 31:
            print '+' * 20
            print 'WARNING: Date-code embedded in URL differs from date-code provided by GKG by {} days! URL-implied date is {}. Skipping {}.'.format(
                diff.days, url_date_code, url)
            print '+' * 20
            return None

    # wait a bit to avoid getting blocked
    time.sleep(2)

    # open the URL and read the data
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    opener.addheaders = [('User-Agent', 'news scraper for mona project')]
    try:
        # wrap file obj in ftfy.fix_file() to organize unicode
        content = ''.join([x for x in ftfy.fix_file(opener.open(url))])
    except urllib2.HTTPError as e:
        print '+' * 20
        print 'WARNING: HTTP error for "{}": {} - {}. Skipping.'.format(
            url, e.code, e.reason)
        print '+' * 20
        return None
    except urllib2.URLError as e:
        print '+' * 20
        print 'WARNING: URL error for "{}": {}. Skipping.'.format(url, e.reason)
        print '+' * 20
        return None
    except Exception as e:
        print '+' * 20
        print 'WARNING: Unexpected exception for "{}": {}. Skipping.'.format(
            url, e.message)
        print '+' * 20
        return None

    # parse the HTML tree
    try:
        tree = html.fromstring(content)
    except Exception:
        print '+' * 20
        print 'WARNING: lxml was unable to parse HTML tree for "{}". Skipping.'.format(url)
        print '+' * 20
        return None

    # translate <br> to \n
    for br in tree.xpath('*//br'):
        br.tail = '\n\n' + br.tail if br.tail else '\n\n'

    # apply source-specific preprocessing to the tree
    if SOURCE_RULES[source].get('tree_preprocessor'):
        tree = SOURCE_RULES[source]['tree_preprocessor'](tree)

    # if we didn't figure out the pub date earlier, do it now
    if not url_date_code and not SOURCE_RULES[source].get('timestamp_xpath'):
        print '*' * 20
        print 'No date code in URL and no timestamp xpath rule defined! Skipping {}'.format(url)
        print '*' * 20
        return None
    elif not url_date_code and SOURCE_RULES[source].get('timestamp_xpath'):
        article_date_els = tree.xpath(SOURCE_RULES[source]['timestamp_xpath'])
        article_date_string = ' '.join(
            [e.text_content() for e in article_date_els])
        if not article_date_string.strip():
            print '+' * 20
            print 'WARNING: No publication date could be found! Skipping {}.'.format(url)
            print '+' * 20
            return None
        try:
            article_date = parse_dt(article_date_string, fuzzy=True, ignoretz=True)
        except:
            print '+' * 20
            print 'WARNING: Unable to evaluate article publication date! No sanity check possible. Skipping {}.'.format(url)
            print '+' * 20
            return None
        gkg_date_code = datetime.datetime.strptime(date, '%Y%m%d%H%M%S')
        diff = gkg_date_code - article_date
        if abs(diff.days) > 31:
            print '+' * 20
            print 'WARNING: Date-code embedded in article differs from date-code provided by GKG by {} days! Article date is {}. Skipping {}.'.format(
                diff.days, article_date, url)
            print '+' * 20
            return None

    # read headline using xpath
    # if necs, adapt to any known-naughty URLs which require special rules
    if SOURCE_RULES[source].get('naughty_list', {}).get(url, {}).get('headline_xpath', {}):
        headline_xpath = SOURCE_RULES[source]['naughty_list'][url]['headline_xpath']
    else:
        headline_xpath = SOURCE_RULES[source]['headline_xpath']
    if headline_xpath:
        headline = '\n\n'.join(
            [e.text_content().lstrip() for e in tree.xpath(headline_xpath)])
    else:
        print 'No headline rule defined for source "{}", skipping "{}"'.format(source, url)
        return None
    if DEBUG:
        print '*' * 20
        print url
        print '-' * 20
        print headline

    # read story content using xpath
    # if necs, adapt to any known-naughty URLs which require special rules
    if SOURCE_RULES[source].get('naughty_list', {}).get(url, {}).get('content_xpath', {}):
        content_xpath = SOURCE_RULES[source]['naughty_list'][url]['content_xpath']
    else:
        content_xpath = SOURCE_RULES[source]['content_xpath']
    # clean up whitespace by replacing all tabs or spaces (incl the nbsp, \xa0)
    # from the text with a single space
    if content_xpath:
        text_blocks = [
            re.sub(r'[\t\xa0 ]+', ' ', e.text_content())
            for e in tree.xpath(content_xpath)
        ]
    else:
        print 'No content rule defined for source "{}", skipping "{}"'.format(source, url)
        return None
    story_content = '\n\n'.join(text_blocks)
    if DEBUG:
        print '-' * 20
        print story_content
        print '*' * 20

    # find repetitive blocks of text and add warning if necs
    rep_count = len([t for t in text_blocks if t.strip()]) - len(
        set([t for t in text_blocks if t.strip()]))
    if rep_count:
        noncritical_warnings['repetitive_text_block_count'] = rep_count
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: {} repetitive text blocks were found! Very suspicious, consider filtering {}.'.format(
                rep_count, url)
            print '^' * 20

    # exclude any data with empty headline or empty story_content
    if not headline or not story_content:
        print '+' * 20
        print 'WARNING: Headline or story from "{}" was blank; excluding from output.'.format(url)
        print '+' * 20
        return None

    # copy the metadata
    try:
        metadict = eval(metadata)
        assert type(metadict) is dict
    except AssertionError as e:
        print '+' * 20
        print 'WARNING: Metadata string "{}" does not evaluate to a dict. Skipping "{}".'.format(
            metadata, url)
        print '+' * 20
        return None
    except Exception as e:
        print '+' * 20
        print 'WARNING: Unexpected error evaluating metadata "{}". Skipping "{}".'.format(
            metadata, url)
        print '+' * 20
        return None

    # some additional sanity checking for character encoding
    strangechars = defaultdict(list)
    for i, c in enumerate(headline + story_content):
        if '\\x' in repr(c) or '\\00' in repr(c):
            strangechars[c].append(i)
    if strangechars:
        strangechars = dict(strangechars)
        noncritical_warnings['suspicious_characters'] = strangechars
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: Found potentially-suspicous characters: {} ("{}")'.format(
                strangechars, url)
            print '^' * 20

    # regex wordcount as a sanity check vs. gdelt wordcount (black box)
    regex_wordcount = len(
        re.findall(
            r'\w+',
            ''.join([c for c in story_content if c not in string.punctuation])))
    if not metadict.get('wordcount'):
        noncritical_warnings['wordcount_mismatch'] = {
            'gdelt': None,
            'scraper_regex': regex_wordcount
        }
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: Metadata missing value for "wordcount" so no sanity-check possible. ({})'.format(url)
            print '^' * 20
    elif abs(regex_wordcount - metadict['wordcount']) / float(
            metadict['wordcount']) > 0.05:
        noncritical_warnings['wordcount_mismatch'] = {
            'gdelt': metadict['wordcount'],
            'scraper_regex': regex_wordcount
        }
        if DEBUG:
            print '^' * 20
            print 'NONCRITICAL: GDELT reports {} words but scraped content contains approximately {} ({})'.format(
                metadict['wordcount'], regex_wordcount, url)
            print '^' * 20

    # if we've made it this far w/o returning None, everything's good to go
    return {
        'headline': headline,
        'story_content': story_content,
        'date': date,
        'metadata': metadict,
        'wordcount_as_scraped': regex_wordcount,
        'warnings': noncritical_warnings
    }
def main():
    """
    Run ftfy as a command-line utility.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="ftfy (fixes text for you), version %s" % __version__
    )
    parser.add_argument(
        "filename",
        default="-",
        nargs="?",
        help="The file whose Unicode is to be fixed. Defaults "
        "to -, meaning standard input.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="-",
        help="The file to output to. Defaults to -, meaning standard output.",
    )
    parser.add_argument(
        "-g",
        "--guess",
        action="store_true",
        help="Ask ftfy to guess the encoding of your input. "
        "This is risky. Overrides -e.",
    )
    parser.add_argument(
        "-e",
        "--encoding",
        type=str,
        default="utf-8",
        help="The encoding of the input. Defaults to UTF-8.",
    )
    parser.add_argument(
        "-n",
        "--normalization",
        type=str,
        default="NFC",
        help="The normalization of Unicode to apply. "
        'Defaults to NFC. Can be "none".',
    )
    parser.add_argument(
        "--preserve-entities",
        action="store_true",
        help="Leave HTML entities as they are. The default "
        "is to decode them, as long as no HTML tags "
        "have appeared in the file.",
    )
    args = parser.parse_args()

    encoding = args.encoding
    if args.guess:
        encoding = None

    if args.filename == "-":
        # Get a standard input stream made of bytes, so we can decode it as
        # whatever encoding is necessary.
        file = sys.stdin.buffer
    else:
        file = open(args.filename, "rb")

    if args.output == "-":
        outfile = sys.stdout
    else:
        if os.path.realpath(args.output) == os.path.realpath(args.filename):
            sys.stderr.write(SAME_FILE_ERROR_TEXT)
            sys.exit(1)
        outfile = open(args.output, "w", encoding="utf-8")

    normalization = args.normalization
    if normalization.lower() == "none":
        normalization = None

    if args.preserve_entities:
        unescape_html = False
    else:
        unescape_html = "auto"

    config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization)

    try:
        for line in fix_file(file, encoding=encoding, config=config):
            try:
                outfile.write(line)
            except UnicodeEncodeError:
                if sys.platform == "win32":
                    sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
                else:
                    sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
                sys.exit(1)
    except UnicodeDecodeError as err:
        sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
        sys.exit(1)
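The same configuration can also be applied outside the CLI. A minimal sketch, assuming ftfy 6.x, where TextFixerConfig and fix_text accept these keyword arguments; the mojibake string is just an illustrative input.

import ftfy
from ftfy import TextFixerConfig

config = TextFixerConfig(unescape_html=False, normalization="NFC")
# Mojibake in, readable text out; HTML entities are left untouched.
print(ftfy.fix_text("The Mona Lisa doesnÃ¢â‚¬â„¢t have eyebrows.", config=config))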
""" This file uses ftfy to fix broken unicode in Python files, strips punctuation, removes numbers. """ import ftfy import sys import codecs for line in ftfy.fix_file( codecs.open('new-latin1.txt', errors='ignore', encoding='latin1')): if line.strip().startswith('['): continue line = line.replace(",", "").replace("?", "").replace(".", "").replace( "!", "").replace(";", "").replace("-", "").replace("'", " ").lower() line = ''.join([i for i in line if not i.isdigit()]) #remove numbers line = line.strip() if line == '': continue print(line.replace("\n", ""))