Example no. 1
def test_russian_crash():
    thebytes = b'\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff '
    # We don't care what the result is, but this shouldn't crash
    thebytes.decode('utf-8-variants', 'replace') 
    
    # This shouldn't crash either
    guess_bytes(thebytes)
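For context, the byte string in this test is Windows-1251-encoded Russian ("инвентаризация "), an encoding outside the small set that guess_bytes can actually detect, so the calls exercise its fallback behaviour. A self-contained sketch of the same check, assuming only that ftfy is installed:

from ftfy import guess_bytes

thebytes = b'\xe8\xed\xe2\xe5\xed\xf2\xe0\xf0\xe8\xe7\xe0\xf6\xe8\xff '
print(thebytes.decode('windows-1251'))  # the intended reading of the bytes
print(guess_bytes(thebytes))            # whatever ftfy guesses; the point is that it must not raise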
Example no. 2
def TestInput(data):
    fdp = atheris.FuzzedDataProvider(data)

    try:
        ftfy.fix_text(fdp.ConsumeString(1000))
        ftfy.fix_text(fdp.ConsumeUnicode(1000))

        plan1 = ftfy.fix_and_explain(fdp.ConsumeString(1000))[1]
        plan2 = ftfy.fix_and_explain(fdp.ConsumeUnicode(1000))[1]
        ftfy.apply_plan(fdp.ConsumeString(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeString(1000), plan2)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan1)
        ftfy.apply_plan(fdp.ConsumeUnicode(1000), plan2)

        ftfy.fix_text_segment(fdp.ConsumeString(1000))
        ftfy.fix_text_segment(fdp.ConsumeUnicode(1000))

        f = open("temp.txt", "w")
        f.write(fdp.ConsumeString(1000))
        f.write(fdp.ConsumeUnicode(1000))
        f.close()
        f = open("temp.txt", "r")
        ftfy.fix_file(f)
        f.close()

        ftfy.guess_bytes(fdp.ConsumeBytes(1000))
    except UnicodeError as e:
        if "Hey wait, this isn't Unicode." not in str(e):
            raise e
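For reference, a harness like this is normally wired into atheris through its standard Setup/Fuzz entry point. The sketch below shows that wiring, assuming atheris and ftfy are importable; the fuzzed body is trimmed to a single call for brevity:

import sys

import atheris
import ftfy


def TestInput(data):
    fdp = atheris.FuzzedDataProvider(data)
    try:
        ftfy.fix_text(fdp.ConsumeUnicode(1000))
    except UnicodeError as e:
        # Same filter as in the harness above: this message marks an expected
        # error; anything else should surface as a fuzzing crash.
        if "Hey wait, this isn't Unicode." not in str(e):
            raise


if __name__ == "__main__":
    atheris.Setup(sys.argv, TestInput)
    atheris.Fuzz()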
Example no. 4
def test_guess_bytes(string):
    for encoding in TEST_ENCODINGS:
        result_str, result_encoding = guess_bytes(string.encode(encoding))
        assert result_str == string
        assert result_encoding == encoding

    if '\n' in string:
        old_mac_bytes = string.replace('\n', '\r').encode('macroman')
        result_str, result_encoding = guess_bytes(old_mac_bytes)
        assert result_str == string.replace('\n', '\r')
Example no. 5
def check_bytes_decoding(string):
    for encoding in TEST_ENCODINGS:
        result_str, result_encoding = guess_bytes(string.encode(encoding))
        eq_(result_str, string)
        eq_(result_encoding, encoding)

    if '\n' in string:
        old_mac_bytes = string.replace('\n', '\r').encode('macroman')
        result_str, result_encoding = guess_bytes(old_mac_bytes)
        eq_(result_str, string.replace('\n', '\r'))
Example no. 8
    def files(self):
        """Files in torrent.

        List of namedtuples (filepath, size).

        :rtype: list[TorrentFile]
        """
        files = []
        info = self._struct.get('info')

        if not info:
            return files

        if 'files' in info:
            base = info['name']

            for f in info['files']:
                try:
                    files.append(
                        TorrentFile(join(base, *f['path']), f['length']))
                except TypeError:
                    fpath = [*f['path']]
                    npath = list()
                    for fp in fpath:
                        if isinstance(fp, bytes):
                            gp = ftfy.guess_bytes(fp)
                            fp = fp.decode(gp[1])
                        npath.append(fp)
                    files.append(TorrentFile(join(base, *npath), f['length']))

        else:
            files.append(TorrentFile(info['name'], info['length']))

        return files
Example no. 9
    def load(self, path_to_file):
        """Loads .txt file from `path_to_file`.

        Arguments:
            path_to_file (pathlib.Path):
                Path to .txt file

        Returns:
            doc (chomskIE.utils.Document)
                Document object corresponding to .txt file in `path_to_file`.
        """
        if not self._validate_data_path(path_to_file, is_directory=False):
            raise PathError(f'{path_to_file} is not a valid file path.')

        try:
            text_obj = open(path_to_file, 'r')
            text = text_obj.read()
        except UnicodeDecodeError:
            text_obj = open(path_to_file, 'rb')
            text, _ = ftfy.guess_bytes(text_obj.read())

        text = ftfy.ftfy(text)
        name = str(path_to_file).split('/')[-1]
        paragraphs = [p.strip() for p in text.splitlines() if p]

        doc = Document(name=name, text=text, paragraphs=paragraphs)
        return doc
Example no. 10
    def load(self, english_model, path_to_file):
        """Loads .txt file from `path_to_file`.

        Arguments:
            english_model (spacy.lang)
                Trained SpaCy language pipeline.
            path_to_file (pathlib.Path):
                Path to .txt file

        Returns:
            doc, spacy_doc (tuple)
                ``doc`` is a ``chomskIE.utils.Document`` object corresponding
                to .txt file in `path`.

                ``spacy_doc`` is a ``spacy.tokens.Document`` object corresponding
                to .txt files in `path` processed by ``english_model``.
        """
        if not self._validate_data_path(path_to_file, is_directory=False):
            raise PathError(f'{path_to_file} is not a valid file path.')

        try:
            text_obj = open(path_to_file, 'r')
            text = text_obj.read()
        except UnicodeDecodeError:
            text_obj = open(path_to_file, 'rb')
            text, _ = ftfy.guess_bytes(text_obj.read())

        text = ftfy.ftfy(text)
        name = str(path_to_file).split('/')[-1]

        spacy_doc = english_model(text)
        doc = Document(name=name, text=None, paragraphs=None)

        return doc, spacy_doc
Example no. 11
def test_guess_bytes():
    for string in TEST_STRINGS:
        yield check_bytes_decoding, string

    bowdlerized_null = b'null\xc0\x80separated'
    result_str, result_encoding = guess_bytes(bowdlerized_null)
    eq_(result_str, u'null\x00separated')
    eq_(result_encoding, u'utf-8-variants')
Example no. 13
    def parse(self, raw):
        parser = regex.compile('<.*?>')
        data = regex.sub(parser, '', raw)
        data = data.replace('\n', '')
        encoding = fx.guess_bytes(data)[1]

        if encoding == 'utf-8':
            decode = data.decode('utf-8')
            data = fx.fix_text(decode)

        return data
Example no. 14
def detect_file_encoding(filename):
    """
    Use ftfy to detect the encoding of a file, based on a sample of its
    first megabyte.

    ftfy's encoding detector is limited. The only encodings it can detect are
    UTF-8, CESU-8, UTF-16, Windows-1252, and occasionally MacRoman. But it
    does much better than chardet.
    """
    with open(filename, 'rb') as opened:
        sample = opened.read(2**20)
        _, encoding = ftfy.guess_bytes(sample)
        return encoding
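As a quick illustration of the docstring above, the hedged sketch below writes a small UTF-16 file (one of the encodings the docstring says the detector handles; Python's 'utf-16' codec adds a BOM) and runs the same detection logic on it. The sample.txt filename is made up for the example:

import ftfy

with open('sample.txt', 'wb') as f:
    f.write('héllo wörld\n'.encode('utf-16'))

with open('sample.txt', 'rb') as opened:
    _, encoding = ftfy.guess_bytes(opened.read(2**20))
print(encoding)  # expected to report 'utf-16'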
Example no. 16
def test_guess_bytes_null():
    bowdlerized_null = b'null\xc0\x80separated'
    result_str, result_encoding = guess_bytes(bowdlerized_null)
    assert result_str == 'null\x00separated'
    assert result_encoding == 'utf-8-variants'
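The 'utf-8-variants' name reported here is one of the extra codecs that ftfy registers with Python; per the docstring in Example no. 14 it covers CESU-8, and this test shows its handling of the \xc0\x80 spelling of a NUL byte. A minimal sketch of decoding with that codec directly, assuming that importing ftfy.bad_codecs is what registers it (the decode call in Example no. 1 relies on the same registration):

import ftfy.bad_codecs  # registers 'utf-8-variants' and ftfy's other extra codecs

bowdlerized_null = b'null\xc0\x80separated'
print(repr(bowdlerized_null.decode('utf-8-variants')))  # 'null\x00separated'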
Example no. 17
def import_from_skforum():
    cursor = connection.cursor()
    # for table in ['forum_message', 'forum_room', 'unread_usertime', 'unread_systemtime']:
    #     cursor.execute(f'truncate {table}')

    from ftfy import fix_text, guess_bytes

    cursor.execute('select CONTEXT, DOCKEY, VERSION, CONTENT from forum.documentversioncontent2')

    for context, dockey, version, content in cursor.fetchall():
        try:
            # if name in an int, then it's a personal wiki for the user of that pk
            int(context)
            context = Context.objects.get_or_create(name=context, custom_data=f'user/{context}')[0]
        except ValueError:
            context = Context.objects.get_or_create(name=context)[0]

        document = Document.objects.get_or_create(context=context, name=dockey)[0]
        DocumentVersion.objects.get_or_create(document=document, version=int(version), content=fix_text(guess_bytes(content)[0]))

    for context in Context.objects.all():
        try:
            int(context.name)
            context.name = 'Private wiki'
            context.save()
        except ValueError:
            pass
Example no. 18
    def cleanup_command_result(cls, result):
        return ftfy.fix_text(ftfy.guess_bytes(result)[0])
Example no. 19
def fix_text_udf(binary_column: pd.Series) -> pd.Series:
    return binary_column.apply(lambda b: fix_text(guess_bytes(b)[0]))
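A brief usage sketch for this UDF; the byte strings are invented for illustration, and only pandas and ftfy are assumed:

import pandas as pd
from ftfy import fix_text, guess_bytes


def fix_text_udf(binary_column: pd.Series) -> pd.Series:
    return binary_column.apply(lambda b: fix_text(guess_bytes(b)[0]))


# guess_bytes picks an encoding for each raw value; fix_text then cleans up
# any mojibake left in the decoded text.
raw = pd.Series([b'caf\xc3\xa9', b'na\xefve'])
print(fix_text_udf(raw))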
Example no. 20
# utf-16 gives error: UnicodeError: UTF-16 stream does not start with BOM
# macRoman macGreek macturkish maclatin2
# latin-1 latin2 - latin10   nb  iso-8859-1 == latin-1  iso-8859-5 to 8
# UTF-16LE UTF-16BE utf_32_le utf_32_be
# ISO-8859-7
# cp500 cp737 cp850 cp852 cp855 cp857 cp858 cp869 cp875 cp1026 cp1140
# greek == iso-8859-7
# ascii (lol)
#

import ftfy


rawdata = open(dir + file, 'rb').read()
result = charade.detect(rawdata)
print ftfy.guess_bytes(rawdata)[0]
print rawdata
print result
'''


with codecs.open(dir + file, mode='r', encoding='utf-8') as infile:
#with io.open(dir + file, mode='rb') as infile:
#    data = infile.read().encode('windows-1250')
        #.decode('latin1')

    #print data
    for line in infile:

        #line = line.replace(u'ˆ', u'à')
        #line = line.replace(u'Õ', u"'")
Example no. 22
def main():
    args = docopt.docopt(__doc__)
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    COLOR = GREEN if args['-g'] else BLUE


    nagents = int(args['-n'])
    if nagents <= 0:
        print 'Number of agents should be positive'
        exit()


    # We will take many samples in an attempt to reduce number of keys to farm
    # This is the number of samples to take since the last improvement
    EXTRA_SAMPLES = int(args['-s'])
    if EXTRA_SAMPLES not in range(1, 101):
        print 'Number of extra samples must be between 1 and 100'
        exit()


    input_file = args['<input_file>']
    name, ext = os.path.splitext(os.path.basename(input_file))

    try:
        os.makedirs(name)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    output_directory = name + os.sep
    output_file = name + '_' + timestamp + '.pkl'

    if ext != '.pkl':
        a = nx.DiGraph()
        np = geometry.np

        locs = []

        # each line should be name,intel_link,keys
        with open(input_file) as fin:
            text, encoding = guess_bytes(fin.read())
            rows = unicodecsv.reader(text.encode('utf-8').strip().split('\n'), encoding='utf-8')
            for i, row in enumerate(rows):
                a.add_node(i)
                a.node[i]['name'] = row[0]

                url = ','.join(row[1:4]).strip()
                if not url.startswith('http'):
                    print 'Unable to parse input file. Did you forget to put quotes around a name containing a comma?'
                    exit()

                coords = urlparse.parse_qs(urlparse.urlparse(url).query)['pll'][0] # this could have been done the quick and dirty way, but I chose this way. It just felt right
                coord_parts = coords.split(',')
                lat = int(float(coord_parts[0]) * 1.e6)
                lon = int(float(coord_parts[1]) * 1.e6)
                locs.append(np.array([lat, lon], dtype=int)) # why does this have to be a numpy array?
                
                if '.' in row[-1]:
                    a.node[i]['keys'] = 0
                else:
                    a.node[i]['keys'] = int(row[-1])

        n = a.order() # number of nodes
        if n > 65:
            print 'Limit of 65 portals may be optimized at once'
            exit()

        locs = np.array(locs, dtype=float)

        # This part assumes we're working with E6 latitude-longitude data
        locs = geometry.e6LLtoRads(locs)
        xyz  = geometry.radstoxyz(locs)
        xy   = geometry.gnomonicProj(locs,xyz)

        for i in xrange(n):
            a.node[i]['geo'] = locs[i]
            a.node[i]['xyz'] = xyz [i]
            a.node[i]['xy' ] = xy  [i]

        # EXTRA_SAMPLES attempts to get graph with few missing keys
        # Try to minimize TK + 2*MK where
        #   TK is the total number of missing keys
        #   MK is the maximum number of missing keys for any single portal
        bestgraph = None
        bestlack = np.inf
        bestTK = np.inf
        bestMK = np.inf

        allTK = []
        allMK = []
        allWeights = []

        sinceImprove = 0

        while sinceImprove < EXTRA_SAMPLES:
            b = a.copy()

            sinceImprove += 1

            if not maxfield.maxFields(b):
                print 'Randomization failure\n\tThe program may work if you try again. It is more likely to work if you remove some portals.'
                continue

            TK = 0
            MK = 0
            for j in xrange(n):
                keylack = max(b.in_degree(j)-b.node[j]['keys'],0)
                TK += keylack
                if keylack > MK:
                    MK = keylack

            weightedlack = TK+2*MK

            allTK.append(TK)
            allMK.append(MK)
            allWeights.append(weightedlack)

            if weightedlack < bestlack:
                sinceImprove = 0
                print 'IMPROVEMENT:\ttotal: {}\tmax: {}\tweighted: {}\t{} tries since improvement'.format(TK, MK, weightedlack, sinceImprove)
                bestgraph = b
                bestlack = weightedlack
                bestTK  = TK
                bestMK  = MK
            else:
                print 'this time:\ttotal: {}\tmax: {}\tweighted: {}\t{} tries since improvement'.format(TK, MK, weightedlack, sinceImprove)

            if weightedlack <= 0:
                print 'KEY PERFECTION'
                bestlack = weightedlack
                bestTK  = TK
                bestMK  = MK
                break
            # if num agent keys is zero, this code isn't true...
            # if all([ b.node[i]['keys'] <= b.out_degree(i) for i in xrange(n) ]):
            #     print 'All keys used. Improvement impossible'
            #     break

        if bestgraph == None:
            print 'EXITING RANDOMIZATION LOOP WITHOUT SOLUTION!'
            print ''
            exit()

        print 'Choosing plan requiring {} additional keys, max of {} from single portal'.format(bestTK, bestMK)

        plt.clf()
        plt.scatter(allTK,allMK,c=allWeights,marker='o')
        plt.xlim(min(allTK)-1,max(allTK)+1)
        plt.ylim(min(allMK)-1,max(allMK)+1)
        plt.xlabel('Total keys required')
        plt.ylabel('Max keys required for a single portal')
        cbar = plt.colorbar()
        cbar.set_label('Optimization Weighting (lower=better)')
        plt.savefig(output_directory+'optimization.png')

        a = bestgraph # remember: a = nx.DiGraph()

        # Attach to each edge a list of fields that it completes
        for t in a.triangulation:
            t.markEdgesWithFields()

        agentOrder.improveEdgeOrder(a)

        with open(output_directory+output_file,'w') as fout:
            pickle.dump(a, fout)
    else:
        with open(input_file,'r') as fin:
            a = pickle.load(fin)

    #    agentOrder.improveEdgeOrder(a)
    #    with open(output_directory+output_file,'w') as fout:
    #        pickle.dump(a,fout)

    PP = PlanPrinterMap.PlanPrinter(a, output_directory, nagents, COLOR)
    PP.keyPrep()
    PP.agentKeys()
    PP.planMap()
    PP.agentLinks()

    # These make step-by-step instructional images
    PP.animate()
    PP.split3instruct()

    print "Number of portals: {0}".format(PP.num_portals)
    print "Number of links: {0}".format(PP.num_links)
    print "Number of fields: {0}".format(PP.num_fields)
    portal_ap = (125*8 + 500 + 250)*PP.num_portals
    link_ap = 313 * PP.num_links
    field_ap = 1250 * PP.num_fields
    print "AP from portals capture: {0}".format(portal_ap)
    print "AP from link creation: {0}".format(link_ap)
    print "AP from field creation: {0}".format(field_ap)
    print "Total AP: {0}".format(portal_ap+link_ap+field_ap)