def test_russian_crash():
    thebytes = b'\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff '
    # We don't care what the result is, but this shouldn't crash
    thebytes.decode('utf-8-variants', 'replace')
    # This shouldn't crash either
    guess_bytes(thebytes)
def TestInput(data):
    fdp = atheris.FuzzedDataProvider(data)
    try:
        ftfy.fix_text(fdp.ConsumeString(1000))
        ftfy.fix_text(fdp.ConsumeUnicode(1000))
        plan1 = ftfy.fix_and_explain(fdp.ConsumeString(1000))[1]
        plan2 = ftfy.fix_and_explain(fdp.ConsumeUnicode(1000))[1]
        ftfy.apply_plan(fdp.ConsumeString(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeString(1000), plan2)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan2)
        ftfy.fix_text_segment(fdp.ConsumeString(1000))
        ftfy.fix_text_segment(fdp.ConsumeUnicode(1000))

        f = open("temp.txt", "w")
        f.write(fdp.ConsumeString(1000))
        f.write(fdp.ConsumeUnicode(1000))
        f.close()

        f = open("temp.txt", "r")
        ftfy.fix_file(f)
        f.close()

        ftfy.guess_bytes(fdp.ConsumeBytes(1000))
    except UnicodeError as e:
        if "Hey wait, this isn't Unicode." not in str(e):
            raise e
def test_guess_bytes(string):
    for encoding in TEST_ENCODINGS:
        result_str, result_encoding = guess_bytes(string.encode(encoding))
        assert result_str == string
        assert result_encoding == encoding

    if '\n' in string:
        old_mac_bytes = string.replace('\n', '\r').encode('macroman')
        result_str, result_encoding = guess_bytes(old_mac_bytes)
        assert result_str == string.replace('\n', '\r')
def check_bytes_decoding(string):
    for encoding in TEST_ENCODINGS:
        result_str, result_encoding = guess_bytes(string.encode(encoding))
        eq_(result_str, string)
        eq_(result_encoding, encoding)

    if '\n' in string:
        old_mac_bytes = string.replace('\n', '\r').encode('macroman')
        result_str, result_encoding = guess_bytes(old_mac_bytes)
        eq_(result_str, string.replace('\n', '\r'))
def files(self):
    """Files in torrent. List of namedtuples (filepath, size).

    :rtype: list[TorrentFile]
    """
    files = []
    info = self._struct.get('info')

    if not info:
        return files

    if 'files' in info:
        base = info['name']

        for f in info['files']:
            try:
                files.append(TorrentFile(join(base, *f['path']), f['length']))
            except TypeError:
                fpath = [*f['path']]
                npath = list()
                for fp in fpath:
                    if isinstance(fp, bytes):
                        gp = ftfy.guess_bytes(fp)
                        fp = fp.decode(gp[1])
                    npath.append(fp)
                files.append(TorrentFile(join(base, *npath), f['length']))
    else:
        files.append(TorrentFile(info['name'], info['length']))

    return files
def load(self, path_to_file):
    """Loads .txt file from `path_to_file`.

    Arguments:
        path_to_file (pathlib.Path): Path to .txt file

    Returns:
        doc (chomskIE.utils.Document)
            Document object corresponding to .txt file in `path_to_file`.
    """
    if not self._validate_data_path(path_to_file, is_directory=False):
        raise PathError(f'{path_to_file} is not a valid file path.')

    try:
        text_obj = open(path_to_file, 'r')
        text = text_obj.read()
    except UnicodeDecodeError:
        text_obj = open(path_to_file, 'rb')
        text, _ = ftfy.guess_bytes(text_obj.read())

    text = ftfy.ftfy(text)
    name = str(path_to_file).split('/')[-1]
    paragraphs = [p.strip() for p in text.splitlines() if p]

    doc = Document(name=name, text=text, paragraphs=paragraphs)
    return doc
def load(self, english_model, path_to_file):
    """Loads .txt file from `path_to_file`.

    Arguments:
        english_model (spacy.lang): Trained SpaCy language pipeline.
        path_to_file (pathlib.Path): Path to .txt file

    Returns:
        doc, spacy_doc (tuple)
            ``doc`` is a ``chomskIE.utils.Document`` object corresponding
            to the .txt file in `path_to_file`.
            ``spacy_doc`` is a ``spacy.tokens.Doc`` object corresponding
            to the .txt file in `path_to_file` processed by ``english_model``.
    """
    if not self._validate_data_path(path_to_file, is_directory=False):
        raise PathError(f'{path_to_file} is not a valid file path.')

    try:
        text_obj = open(path_to_file, 'r')
        text = text_obj.read()
    except UnicodeDecodeError:
        text_obj = open(path_to_file, 'rb')
        text, _ = ftfy.guess_bytes(text_obj.read())

    text = ftfy.ftfy(text)
    name = str(path_to_file).split('/')[-1]
    spacy_doc = english_model(text)

    doc = Document(name=name, text=None, paragraphs=None)
    return doc, spacy_doc
def test_guess_bytes():
    for string in TEST_STRINGS:
        yield check_bytes_decoding, string

    bowdlerized_null = b'null\xc0\x80separated'
    result_str, result_encoding = guess_bytes(bowdlerized_null)
    eq_(result_str, u'null\x00separated')
    eq_(result_encoding, u'utf-8-variants')
def parse(self, raw):
    # `raw` is expected to be raw bytes, so the tag-stripping regex and the
    # newline replacement operate on bytes before the encoding is guessed.
    parser = regex.compile(b'<.*?>')
    data = regex.sub(parser, b'', raw)
    data = data.replace(b'\n', b'')
    encoding = fx.guess_bytes(data)[1]
    if encoding == 'utf-8':
        decoded = data.decode('utf-8')
        data = fx.fix_text(decoded)
    return data
def detect_file_encoding(filename):
    """
    Use ftfy to detect the encoding of a file, based on a sample of its
    first megabyte.

    ftfy's encoding detector is limited. The only encodings it can detect are
    UTF-8, CESU-8, UTF-16, Windows-1252, and occasionally MacRoman. But it
    does much better than chardet.
    """
    with open(filename, 'rb') as opened:
        sample = opened.read(2 ** 20)

    _, encoding = ftfy.guess_bytes(sample)
    return encoding
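# A minimal usage sketch for detect_file_encoding above; the helper name
# read_with_detected_encoding and the idea of reopening with the guessed name
# are illustrative assumptions, not part of the original code. Importing ftfy
# should also register its extra codec names (such as 'utf-8-variants'), so the
# guessed encoding can usually be passed straight to open().
import ftfy

def read_with_detected_encoding(filename):
    # Detect from the first-megabyte sample, then reopen the file in text mode.
    encoding = detect_file_encoding(filename)
    with open(filename, encoding=encoding) as opened:
        return opened.read()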
def test_guess_bytes_null():
    bowdlerized_null = b'null\xc0\x80separated'
    result_str, result_encoding = guess_bytes(bowdlerized_null)
    assert result_str == 'null\x00separated'
    assert result_encoding == 'utf-8-variants'
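# A small standalone sketch (not part of the test suite above) of the contract
# these tests exercise: guess_bytes takes bytes and returns a
# (decoded_text, guessed_encoding) pair. Plain valid UTF-8 input is reported
# back as 'utf-8'.
from ftfy import guess_bytes

def demo_guess_bytes_contract():
    text, encoding = guess_bytes('café'.encode('utf-8'))
    assert text == 'café'
    assert encoding == 'utf-8'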
def import_from_skforum():
    cursor = connection.cursor()
    # for table in ['forum_message', 'forum_room', 'unread_usertime', 'unread_systemtime']:
    #     cursor.execute(f'truncate {table}')

    from ftfy import fix_text, guess_bytes

    cursor.execute('select CONTEXT, DOCKEY, VERSION, CONTENT from forum.documentversioncontent2')
    for context, dockey, version, content in cursor.fetchall():
        try:
            # if name is an int, then it's a personal wiki for the user of that pk
            int(context)
            context = Context.objects.get_or_create(name=context, custom_data=f'user/{context}')[0]
        except ValueError:
            context = Context.objects.get_or_create(name=context)[0]

        document = Document.objects.get_or_create(context=context, name=dockey)[0]
        DocumentVersion.objects.get_or_create(
            document=document,
            version=int(version),
            content=fix_text(guess_bytes(content)[0]),
        )

    for context in Context.objects.all():
        try:
            int(context.name)
            context.name = 'Private wiki'
            context.save()
        except ValueError:
            pass
def cleanup_command_result(cls, result):
    return ftfy.fix_text(ftfy.guess_bytes(result)[0])
def fix_text_udf(binary_column: pd.Series) -> pd.Series:
    return binary_column.apply(lambda b: fix_text(guess_bytes(b)[0]))
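# Hypothetical local usage of fix_text_udf above; in a Spark job the same
# function would typically be wrapped as a pandas UDF, which is assumed rather
# than shown here. The sample byte values are made up for illustration.
import pandas as pd
from ftfy import fix_text, guess_bytes

raw = pd.Series([b'caf\xc3\xa9', 'já vi'.encode('utf-8')])
print(fix_text_udf(raw).tolist())  # decoded, mojibake-repaired strings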
# utf-16 gives error: UnicodeError: UTF-16 stream does not start with BOM
# macRoman macGreek macturkish maclatin2
# latin-1 latin2 - latin10 nb iso-8859-1 == latin-1 iso-8859-5 to 8
# UTF-16LE UTF-16BE utf_32_le utf_32_be
# ISO-8859-7
# cp500 cp737 cp850 cp852 cp855 cp857 cp858 cp869 cp875 cp1026 cp1140
# greek == iso-8859-7
# ascii (lol)
# import ftfy
rawdata = open(dir + file, 'rb').read()
result = charade.detect(rawdata)
print ftfy.guess_bytes(rawdata)[0]
print rawdata
print result
'''
with codecs.open(dir + file, mode='r', encoding='utf-8') as infile:
#with io.open(dir + file, mode='rb') as infile:
#    data = infile.read().encode('windows-1250') #.decode('latin1')
    #print data
    for line in infile:
        #line = line.replace(u'ˆ', u'à')
        #line = line.replace(u'Õ', u"'")
def main():
    args = docopt.docopt(__doc__)

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    COLOR = GREEN if args['-g'] else BLUE

    nagents = int(args['-n'])
    if nagents <= 0:
        print 'Number of agents should be positive'
        exit()

    # We will take many samples in an attempt to reduce number of keys to farm
    # This is the number of samples to take since the last improvement
    EXTRA_SAMPLES = int(args['-s'])
    if EXTRA_SAMPLES not in range(1, 101):
        print 'Number of extra samples must be between 1 and 100'
        exit()

    input_file = args['<input_file>']
    name, ext = os.path.splitext(os.path.basename(input_file))

    try:
        os.makedirs(name)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    output_directory = name + os.sep
    output_file = name + '_' + timestamp + '.pkl'

    if ext != '.pkl':
        a = nx.DiGraph()
        np = geometry.np
        locs = []

        # each line should be name,intel_link,keys
        with open(input_file) as fin:
            text, encoding = guess_bytes(fin.read())
            rows = unicodecsv.reader(text.encode('utf-8').strip().split('\n'), encoding='utf-8')
            for i, row in enumerate(rows):
                a.add_node(i)
                a.node[i]['name'] = row[0]
                url = ','.join(row[1:4]).strip()
                if not url.startswith('http'):
                    print 'Unable to parse input file. Did you forget to put quotes around a name containing a comma?'
                    exit()
                coords = urlparse.parse_qs(urlparse.urlparse(url).query)['pll'][0]
                # this could have been done the quick and dirty way, but I chose this way. It just felt right
                coord_parts = coords.split(',')
                lat = int(float(coord_parts[0]) * 1.e6)
                lon = int(float(coord_parts[1]) * 1.e6)
                locs.append(np.array([lat, lon], dtype=int))  # why does this have to be a numpy array?
                if '.' in row[-1]:
                    a.node[i]['keys'] = 0
                else:
                    a.node[i]['keys'] = int(row[-1])

        n = a.order()  # number of nodes
        if n > 65:
            print 'Limit of 65 portals may be optimized at once'
            exit()

        locs = np.array(locs, dtype=float)

        # This part assumes we're working with E6 latitude-longitude data
        locs = geometry.e6LLtoRads(locs)
        xyz = geometry.radstoxyz(locs)
        xy = geometry.gnomonicProj(locs, xyz)

        for i in xrange(n):
            a.node[i]['geo'] = locs[i]
            a.node[i]['xyz'] = xyz[i]
            a.node[i]['xy'] = xy[i]

        # EXTRA_SAMPLES attempts to get graph with few missing keys
        # Try to minimize TK + 2*MK where
        #   TK is the total number of missing keys
        #   MK is the maximum number of missing keys for any single portal
        bestgraph = None
        bestlack = np.inf
        bestTK = np.inf
        bestMK = np.inf

        allTK = []
        allMK = []
        allWeights = []

        sinceImprove = 0

        while sinceImprove < EXTRA_SAMPLES:
            b = a.copy()

            sinceImprove += 1

            if not maxfield.maxFields(b):
                print 'Randomization failure\n\tThe program may work if you try again. It is more likely to work if you remove some portals.'
                continue

            TK = 0
            MK = 0
            for j in xrange(n):
                keylack = max(b.in_degree(j) - b.node[j]['keys'], 0)
                TK += keylack
                if keylack > MK:
                    MK = keylack

            weightedlack = TK + 2*MK

            allTK.append(TK)
            allMK.append(MK)
            allWeights.append(weightedlack)

            if weightedlack < bestlack:
                sinceImprove = 0
                print 'IMPROVEMENT:\ttotal: {}\tmax: {}\tweighted: {}\t{} tries since improvement'.format(TK, MK, weightedlack, sinceImprove)
                bestgraph = b
                bestlack = weightedlack
                bestTK = TK
                bestMK = MK
            else:
                print 'this time:\ttotal: {}\tmax: {}\tweighted: {}\t{} tries since improvement'.format(TK, MK, weightedlack, sinceImprove)

            if weightedlack <= 0:
                print 'KEY PERFECTION'
                bestlack = weightedlack
                bestTK = TK
                bestMK = MK
                break

            # if num agent keys is zero, this code isn't true...
            # if all([ b.node[i]['keys'] <= b.out_degree(i) for i in xrange(n) ]):
            #     print 'All keys used. Improvement impossible'
            #     break

        if bestgraph is None:
            print 'EXITING RANDOMIZATION LOOP WITHOUT SOLUTION!'
            print ''
            exit()

        print 'Choosing plan requiring {} additional keys, max of {} from single portal'.format(bestTK, bestMK)

        plt.clf()
        plt.scatter(allTK, allMK, c=allWeights, marker='o')
        plt.xlim(min(allTK)-1, max(allTK)+1)
        plt.ylim(min(allMK)-1, max(allMK)+1)
        plt.xlabel('Total keys required')
        plt.ylabel('Max keys required for a single portal')
        cbar = plt.colorbar()
        cbar.set_label('Optimization Weighting (lower=better)')
        plt.savefig(output_directory + 'optimization.png')

        a = bestgraph

        # remember: a = nx.DiGraph()
        # Attach to each edge a list of fields that it completes
        for t in a.triangulation:
            t.markEdgesWithFields()

        agentOrder.improveEdgeOrder(a)

        with open(output_directory + output_file, 'w') as fout:
            pickle.dump(a, fout)
    else:
        with open(input_file, 'r') as fin:
            a = pickle.load(fin)
        # agentOrder.improveEdgeOrder(a)
        # with open(output_directory+output_file,'w') as fout:
        #     pickle.dump(a,fout)

    PP = PlanPrinterMap.PlanPrinter(a, output_directory, nagents, COLOR)
    PP.keyPrep()
    PP.agentKeys()
    PP.planMap()
    PP.agentLinks()

    # These make step-by-step instructional images
    PP.animate()
    PP.split3instruct()

    print "Number of portals: {0}".format(PP.num_portals)
    print "Number of links: {0}".format(PP.num_links)
    print "Number of fields: {0}".format(PP.num_fields)
    portal_ap = (125*8 + 500 + 250) * PP.num_portals
    link_ap = 313 * PP.num_links
    field_ap = 1250 * PP.num_fields
    print "AP from portals capture: {0}".format(portal_ap)
    print "AP from link creation: {0}".format(link_ap)
    print "AP from field creation: {0}".format(field_ap)
    print "Total AP: {0}".format(portal_ap + link_ap + field_ap)