Example #1
    '''Splits on whitespace, then uses ICU for Latin, Tiny for Japanese.
       Ignores everything else. E.g.:

       >>> Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR) == base.T_JP_TOKS + base.T_FR_TOKS
       True
       >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
       True'''
    def __init__(self, ngram):
        base.Tzer.__init__(self, ngram)
        self.tiny = tiny.Tzer(ngram)
        self.icu = ICU(ngram)

    def tokenize_real(self, text):
        ws_tokens = text.split()
        tokens = []
        for ws_token in ws_tokens:
            if (is_latin(ws_token)):
                tokens.extend(self.icu.tokenize(ws_token))
            elif (is_japanese(ws_token)):
                tokens.extend(self.tiny.tokenize(ws_token))
        return tokens


# Test-Depends: manual icu
testable.register('''

>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True

''')
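The is_latin() and is_japanese() helpers used by tokenize_real() are not part of this excerpt. A minimal sketch of what they might look like, assuming a character-name heuristic via the standard unicodedata module (the real module presumably consults its own script tables):

import unicodedata

def is_latin(token):
    # A word counts as Latin if every alphabetic character is a Latin letter.
    return all('LATIN' in unicodedata.name(c, '')
               for c in token if c.isalpha())

def is_japanese(token):
    # A word counts as Japanese if any character is Hiragana, Katakana, or CJK.
    return any(any(s in unicodedata.name(c, '')
                   for s in ('HIRAGANA', 'KATAKANA', 'CJK'))
               for c in token)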
Example #2
    def table_exists_p(self, table):
        return (1 == len(
            self.sql(
                """SELECT 1 FROM sqlite_master
                                   WHERE name = ?""", (table, ))))

    def table_ct(self):
        'Return the number of tables in the database.'
        return self.sql("SELECT count(*) FROM sqlite_master")[0][0]


testable.register('''

   # FIXME: the kludge to silence SpatiaLite fails with AttributeError when
   # run under doctest. Therefore, we test without SpatiaLite for now.

   # Initialize an in-memory database
   >>> db = DB(':memory:', create=True, spatialite=False)
   >>> db.is_empty()
   True
   >>> db.create_table('foo', { 'a': 'int' })

   # Does table_ct() work?
   >>> db.table_ct()
   1
   >>> db.is_empty()
   False

''')
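The sql() helper that table_exists_p() and table_ct() rely on is not shown. A minimal sketch, assuming it simply executes a parameterized statement on an underlying sqlite3 connection and returns all fetched rows (the class and attribute names here are stand-ins, not the project's):

import sqlite3

class DB_Sketch(object):
    'Hypothetical stand-in for the DB class used above.'

    def __init__(self, filename=':memory:'):
        self.conn = sqlite3.connect(filename)

    def sql(self, statement, args=()):
        # Execute a parameterized statement and return the fetched rows.
        return self.conn.execute(statement, args).fetchall()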
Example #3
                   for gmm in self.all_gmms])
        if self.verbose:
            for (fv, fi) in self.feature_alphabet.iteritems():
                l.debug('feature weight %s=%g' % (fv, res.x[fi]))
            for (t, w) in di.iteritems():
                l.debug('token weight %s=%s' % (t, str(w)))
        # clean up
        for g in self.all_gmms:
            g.feature_vector = None
        return di


# test that self.all_gmms has stable order
testable.register('''
>>> import gmm
>>> import random
>>> def test_random():
...   u.rand = random.Random(123)
...   gmm.Token.parms_init({})
...   mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...   m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...   m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...   m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...   m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...            [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...            misc_feature=False)
...   return list(m.all_gmms)
>>> all((test_random()[0].tokens == test_random()[0].tokens for i in xrange(100)))
True
''')
Example #4
    'tweet_id': -1,
    'created_at': datetime.now(),
    'text': 'a b',
    'user_screen_name': 'c',
    'user_description': 'd',
    'user_lang': 'e',
    'user_location': 'f',
    'user_time_zone': 'g',
    'geom': None,
    'geom_src': None
})
T_TW_JSON_CO = r'''{"text":"Guantes, bufanda, tenis y chamarra :) #Viena","id_str":"186339941163339776","contributors":null,"in_reply_to_status_id_str":null,"geo":{"type":"Point","coordinates":[48.24424304,16.37778864]},"retweet_count":0,"in_reply_to_status_id":null,"favorited":false,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twitter.com\/#!\/download\/iphone\" rel=\"nofollow\"\u003ETwitter for iPhone\u003C\/a\u003E","created_at":"Sun Apr 01 06:31:18 +0000 2012","in_reply_to_user_id_str":null,"truncated":false,"entities":{"urls":[],"hashtags":[{"text":"Viena","indices":[38,44]}],"user_mentions":[]},"coordinates":{"type":"Point","coordinates":[16.37778864,48.24424304]},"place":{"country":"Austria","place_type":"city","url":"http:\/\/api.twitter.com\/1\/geo\/id\/9f659d51e5c5deae.json","country_code":"AT","bounding_box":{"type":"Polygon","coordinates":[[[16.182302,48.117666],[16.577511,48.117666],[16.577511,48.322574],[16.182302,48.322574]]]},"attributes":{},"full_name":"Vienna, Vienna","name":"Vienna","id":"9f659d51e5c5deae"},"in_reply_to_screen_name":null,"user":{"profile_background_color":"8B542B","id_str":"249409866","profile_background_tile":true,"screen_name":"montse_moso","listed_count":3,"time_zone":"Mexico City","profile_sidebar_fill_color":"ffffff","description":"you  It's exhausting being this Juicy \u2764","default_profile":false,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","created_at":"Wed Feb 09 00:21:15 +0000 2011","profile_sidebar_border_color":"f03368","is_translator":false,"contributors_enabled":false,"geo_enabled":true,"url":null,"profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2003516916\/image_normal.jpg","follow_request_sent":null,"profile_use_background_image":true,"lang":"es","verified":false,"profile_text_color":"333333","protected":false,"default_profile_image":false,"show_all_inline_media":false,"notifications":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","location":"","name":"Montse Alcaraz ","favourites_count":415,"profile_link_color":"9D582E","id":249409866,"statuses_count":5252,"following":null,"utc_offset":-21600,"friends_count":368,"followers_count":191,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2003516916\/image_normal.jpg"},"retweeted":false,"id":186339941163339776}'''
# FIXME: add test tweets for the other geotag sources

testable.register('''

# Make sure we don't drop anything through all the parsing and unparsing.
>>> a = from_json(T_TW_JSON_CO)
>>> a.geom_src
'co'
>>> a.created_at
datetime.datetime(2012, 4, 1, 6, 31, 18, tzinfo=<UTC>)
>>> a.day
'2012-04-01'
>>> a == Tweet.from_list(a.to_list())
True
>>> a == Tweet.from_dict(a.to_dict())
True

''')
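The day attribute checked above is not defined in this excerpt; from the expected output it is presumably just the ISO-formatted date of created_at. A hypothetical helper with that assumed behavior:

from datetime import datetime

def tweet_day(created_at):
    # e.g. tweet_day(datetime(2012, 4, 1, 6, 31, 18)) == '2012-04-01'
    return created_at.strftime('%Y-%m-%d')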
Example #5
testable.register('''

# Make sure random seed is set to a known value
>>> rand.random()
0.40224696110279223

# Memoized function fails with TypeError if passed an unhashable argument.
>>> @memoize
... def f(x):
...   return x*2
>>> f(dict())
Traceback (most recent call last):
  ...
TypeError: unhashable type: 'dict'

# Check that memoized reset() works by looking at exposed cache.
>>> f(1)
2
>>> f.cache
{(1,): 2}
>>> f.reset()
>>> f.cache
{}

# More slices. Basically, we want (almost) the same behavior as if we had
# typed the slice into the Python interpreter. The "and None" trick is simply
# to suppress output if the expression is true, so we don't have to keep
# typing "True".
>>> a = [0, 1, 2, 3, 4]
>>> (a[slp(':')] == a) and None
>>> (a[slp('0')] == [a[0]]) and None
>>> (a[slp('4')] == [a[4]]) and None
>>> a[slp('5')]
[]
>>> (a[slp('-1')] == [a[-1]]) and None
>>> (a[slp('-2')] == [a[-2]]) and None
>>> (a[slp('-5')] == [a[-5]]) and None
>>> a[slp('-6')]
[]
>>> (a[slp('1:')] == a[1:]) and None
>>> (a[slp(':1')] == a[:1]) and None
>>> (a[slp('-2:')] == a[-2:]) and None
>>> (a[slp(':-2')] == a[:-2]) and None
>>> (a[slp('1::')] == a[1::]) and None
>>> (a[slp('::1')] == a[::1]) and None
>>> (a[slp('2::')] == a[2::]) and None
>>> (a[slp('::2')] == a[::2]) and None
>>> (a[slp('-1::')] == a[-1::]) and None
>>> (a[slp('::-1')] == a[::-1]) and None

# More unioned slices
>>> pprint(sl_union(10))  # no slices
set()
>>> pprint(sl_union(0, slp('1')))  # empty list
set()
>>> pprint(sl_union(10, slp('1:4')))  # one slice
set([1, 2, 3])
>>> pprint(sl_union(10, slp('1:4'), slp('3')))  # overlapping slices
set([1, 2, 3])
>>> pprint(sl_union(10, slp('10')))  # fully out of bounds
set()
>>> pprint(sl_union(10, slp('9:11')))  # partly out of bounds
set([9])
>>> pprint(sl_union(10, slp('9'), slp('10')))  # one in, one out
set([9])

''')
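memoize and slp() are defined elsewhere in the module. A minimal sketch of a memoize decorator consistent with the doctests above (an argument-tuple cache exposed as .cache plus a .reset() method); the project's real implementation may differ:

import functools

def memoize(f):
    @functools.wraps(f)
    def wrapper(*args):
        # Unhashable arguments raise TypeError here, as the doctest expects.
        if args not in wrapper.cache:
            wrapper.cache[args] = f(*args)
        return wrapper.cache[args]
    wrapper.cache = {}
    wrapper.reset = wrapper.cache.clear
    return wrapper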
Example #6
                  max(self.min_value, gmm.score))
                 for gmm in self.all_gmms])
      if self.verbose:
         for (fv,fi) in self.feature_alphabet.iteritems():
            l.debug('feature weight %s=%g' % (fv,res.x[fi]))
         for (t,w) in di.iteritems():
            l.debug('token weight %s=%s'%(t,str(w)))
      # clean up
      for g in self.all_gmms:
         g.feature_vector = None
      return di

# test that self.all_gmms has stable order
testable.register('''
>>> import gmm
>>> import random
>>> def test_random():
...   u.rand = random.Random(123)
...   gmm.Token.parms_init({})
...   mp = geos.MultiPoint(geos.Point(1,2), geos.Point(3,4), srid=4326)
...   m1 = gmm.Geo_GMM.from_fit(mp, 1, 'a')
...   m2 = gmm.Geo_GMM.from_fit(mp, 2, 'b')
...   m3 = gmm.Geo_GMM.from_fit(mp, 1, 'c')
...   m = Weight([[m1, m2], [m2, m3], [m1, m3]],
...            [[100, 50], [50, 200], [80, 400]], identity_feature=True,
...            misc_feature=False)
...   return list(m.all_gmms)
>>> all((test_random()[0].tokens == test_random()[0].tokens for i in xrange(100)))
True
''')
Example #7
testable.register('''

# Make sure the SRIDs we're interested in are available.
>>> for srid in (4326, 54003, 540033, 540036, 54009, 540093, 540096):
...   if not isinstance(SRS[srid], gdal.SpatialReference): srid

# Test that we can transform to and from the custom SRSes.
>>> a = geos.Point(1, 2, srid=SRID_WGS84)
>>> b = transform(a, 540036)
>>> a.srid
4326
>>> b.coords
(0.111..., 0.220...)
>>> b.srid
540036
>>> c = transform(b, 4326)
>>> c.srid
4326
>>> [round(x, 4) for x in c.coords]
[1.0, 2.0]

# geodesic_area() should raise an exception if we give it a bogus geometry type.
>>> geodesic_area(geos.Point(0,0))
Traceback (most recent call last):
  ...
TypeError: need Polygon or MultiPolygon, not Point

# inbounds_p() should work north/south and on an SRS that requires a transform
>>> inbounds_p(geos.Point(0, 89.98, srid=SRID_WGS84))
True
>>> inbounds_p(geos.Point(0, 90.01, srid=SRID_WGS84))
False
>>> inbounds_p(geos.Point(0, -89.98, srid=SRID_WGS84))
True
>>> inbounds_p(geos.Point(0, -90.01, srid=SRID_WGS84))
False
>>> inbounds_p(geos.Point(0, 14671436.0, srid=54003))
True
>>> inbounds_p(geos.Point(0, 14671436.1, srid=54003))
False
>>> inbounds_p(geos.Point(0, -14671436.0, srid=54003))
True
>>> inbounds_p(geos.Point(0, -14671436.1, srid=54003))
False

# Ensure that trim() works on multipolygons.
>>> yo = 15e6
>>> yi = 14e6
>>> mp = geos.MultiPoint([geos.Point(0, yi), geos.Point(0, yo)], srid=54003)
>>> trim(mp).coords
(0.0, 14000000.0)

''')
Example #8
class Token_All_Pipeline(pipeline.Model):
   def __init__(self, token_iterator):
      assert False, 'unimplemented'
      pipeline.Model.__init__(self, [Token(token_iterator),
                                     All_Tweets(token_iterator)])


### Tests ###

# Test passes as of sklearn 0.13-git
testable.register('''

# Test that fitting respects consistent random state.
>>> def test_r():
...   r = np.random.mtrand.RandomState(1234)
...   m = sklearn.mixture.GMM(n_components=2, random_state=r)
...   m.fit([1, 1.1, 2, 2.2])
...   return m.sample(10, r)
>>> all((test_r().tolist() == test_r().tolist() for i in xrange(100)))
True

''')

def test_interactive():
   import cProfile

   #prof = cProfile.Profile()
   #prof.enable()
   u.logging_init('inter', verbose_=True)
   test_error_metrics()
   test_interactive_real()
Example #9
                                  u.fmt_sparsearray(self.data))

    def save(self, ignore=-1):
        self.total_update()
        if (self.total < ignore):
            return False
        if (self.total <= FRAGMENT_TOTAL_ZMAX):
            data = zlib.compress(self.data.data, ZLEVEL)
        else:
            data = self.data.data
        if (self.source == Fragment_Source.NEW):
            self.group.db.sql(
                """INSERT INTO data%d (name, dtype, total, data)
                              VALUES (?, ?, ?, ?)""" % self.shard,
                (self.name, self.data.dtype.char, self.total, data))
        else:
            self.group.db.sql(
                """UPDATE data%d
                              SET dtype=?, total=?, data=?
                              WHERE name=?""" % self.shard,
                (self.data.dtype.char, self.total, data, self.name))
        return True

    def total_update(self):
        # np.sum() returns a NumPy data type, which confuses SQLite somehow.
        # Therefore, use a plain Python float.
        self.total = float(np.nansum(np.abs(self.data)))


testable.register()
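save() stores the array bytes zlib-compressed only when the fragment's total is at or below FRAGMENT_TOTAL_ZMAX. A hypothetical counterpart for turning such a row back into an array under that same assumption (the function and parameter names here are illustrative, not from the original module):

import zlib
import numpy as np

def fragment_data_load(raw, dtype_char, total, zmax):
    # Reverse of save(): decompress small-total rows, then rebuild the array.
    buf = zlib.decompress(raw) if total <= zmax else raw
    return np.frombuffer(buf, dtype=np.dtype(dtype_char))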
Example #10
    def __init__(self, token_iterator):
        assert False, 'unimplemented'
        pipeline.Model.__init__(
            self, [Token(token_iterator),
                   All_Tweets(token_iterator)])


### Tests ###

# Test passes as of sklearn 0.13-git
testable.register('''

# Test that fitting respects consistent random state.
>>> def test_r():
...   r = np.random.mtrand.RandomState(1234)
...   m = sklearn.mixture.GMM(n_components=2, random_state=r)
...   m.fit([1, 1.1, 2, 2.2])
...   return m.sample(10, r)
>>> all((test_r().tolist() == test_r().tolist() for i in xrange(100)))
True

''')


def test_interactive():
    import cProfile

    #prof = cProfile.Profile()
    #prof.enable()
    u.logging_init('inter', verbose_=True)
    test_error_metrics()
    test_interactive_real()
Example #11
testable.register('''

# Make sure random seed is set to a known value
>>> rand.random()
0.40224696110279223

# Memoized function fails with TypeError if passed an unhashable argument.
>>> @memoize
... def f(x):
...   return x*2
>>> f(dict())
Traceback (most recent call last):
  ...
TypeError: unhashable type: 'dict'

# Check that memoized reset() works by looking at exposed cache.
>>> f(1)
2
>>> f.cache
{(1,): 2}
>>> f.reset()
>>> f.cache
{}

# More slices. Basically, we want (almost) the same behavior as if we had
# typed the slice into the Python interpreter. The "and None" trick is simply
# to suppress output if the expression is true, so we don't have to keep
# typing "True".
>>> a = [0, 1, 2, 3, 4]
>>> (a[slp(':')] == a) and None
>>> (a[slp('0')] == [a[0]]) and None
>>> (a[slp('4')] == [a[4]]) and None
>>> a[slp('5')]
[]
>>> (a[slp('-1')] == [a[-1]]) and None
>>> (a[slp('-2')] == [a[-2]]) and None
>>> (a[slp('-5')] == [a[-5]]) and None
>>> a[slp('-6')]
[]
>>> (a[slp('1:')] == a[1:]) and None
>>> (a[slp(':1')] == a[:1]) and None
>>> (a[slp('-2:')] == a[-2:]) and None
>>> (a[slp(':-2')] == a[:-2]) and None
>>> (a[slp('1::')] == a[1::]) and None
>>> (a[slp('::1')] == a[::1]) and None
>>> (a[slp('2::')] == a[2::]) and None
>>> (a[slp('::2')] == a[::2]) and None
>>> (a[slp('-1::')] == a[-1::]) and None
>>> (a[slp('::-1')] == a[::-1]) and None

# More unioned slices
>>> sl_union(10)  # no slices
set()
>>> sl_union(0, slp('1'))  # empty list
set()
>>> sorted(sl_union(10, slp('1:4')))  # one slice
[1, 2, 3]
>>> sorted(sl_union(10, slp('1:4'), slp('3')))  # overlapping slices
[1, 2, 3]
>>> sl_union(10, slp('10'))  # fully out of bounds
set()
>>> sl_union(10, slp('9:11'))  # partly out of bounds
{9}
>>> sl_union(10, slp('9'), slp('10'))  # one in, one out
{9}

''')
Example #12
class TSV_Output_Job(Job):

   '''Mixin for TSV UTF-8 text output. :meth:`reduce_write()` expects a
      sequence of stringifiable objects.'''

   def reduce_open_output(self):
      assert False, 'unimplemented'

   def reduce_write(self, item):
      self.outfp.writerow(item)


testable.register(r'''

# Test data passing from mapper to reducer.
>>> import io
>>> buf = io.BytesIO()
>>> job = Test_Job()
>>> job.outfp = buf
>>> for kv in [(1, -1), (2, -2), (2, -3), (3, -4), (3, -5), (3, -6)]:
...    job.map_write(*kv)
>>> buf.getvalue()
b'1\tgASVBgAAAAAAAABK/////y4=\n2\tgASVBgAAAAAAAABK/v///y4=\n2\tgASVBgAAAAAAAABK/f///y4=\n3\tgASVBgAAAAAAAABK/P///y4=\n3\tgASVBgAAAAAAAABK+////y4=\n3\tgASVBgAAAAAAAABK+v///y4=\n'
>>> buf.seek(0)
0
>>> job.infp = buf
>>> [(k, list(v)) for (k, v) in job.reduce_inputs()]
[('1', [-1]), ('2', [-2, -3]), ('3', [-4, -5, -6])]

''')
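map_write() and reduce_inputs() are not part of this excerpt. From the expected buffer contents, a mapper output line is apparently the key, a tab, and the base64-encoded pickle of the value; a rough sketch of that encoding (the helper name is hypothetical):

import base64
import pickle

def encode_kv(key, value):
    # One mapper output line: key, tab, base64(pickle(value)), newline.
    payload = base64.b64encode(pickle.dumps(value))
    return str(key).encode('utf8') + b'\t' + payload + b'\n'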
Example #13
   def close(self):
      if (self.writable and self.locked):
         u.lock_release(self.filename)

   def commit(self):
      'Write data to disk.'
      assert (self.writable)
      fp = io.open(self.filename, mode='wb')
      pickle.dump(self.data, fp, pickle.HIGHEST_PROTOCOL)


testable.register('''

>>> import os
>>> import tempfile
>>> testfile = tempfile.mktemp()
>>> a = File(testfile, default=[1,2,3], writable=True)
>>> a.data
[1, 2, 3]
>>> a.data.append(4)
>>> a.data
[1, 2, 3, 4]
>>> a.commit()
>>> del a
>>> b = File(testfile)
>>> b.data
[1, 2, 3, 4]
>>> os.unlink(testfile)

''')
Example #14
testable.register('''

# Make sure the SRIDs we're interested in are available.
>>> for srid in (4326, 54003, 540033, 540036, 54009, 540093, 540096):
...   if not isinstance(SRS[srid], gdal.SpatialReference): srid

# Test that we can transform to and from the custom SRSes.
>>> a = geos.Point(1, 2, srid=SRID_WGS84)
>>> b = transform(a, 540036)
>>> a.srid
4326
>>> b.coords
(0.111..., 0.220...)
>>> b.srid
540036
>>> c = transform(b, 4326)
>>> c.srid
4326
>>> [round(x, 4) for x in c.coords]
[1.0, 2.0]

# geodesic_area() should raise an exception if we give it a bogus geometry type.
>>> geodesic_area(geos.Point(0,0))
Traceback (most recent call last):
  ...
TypeError: need Polygon or MultiPolygon, not Point

# inbounds_p() should work north/south and on an SRS that requires a transform
>>> inbounds_p(geos.Point(0, 89.98, srid=SRID_WGS84))
True
>>> inbounds_p(geos.Point(0, 90.01, srid=SRID_WGS84))
False
>>> inbounds_p(geos.Point(0, -89.98, srid=SRID_WGS84))
True
>>> inbounds_p(geos.Point(0, -90.01, srid=SRID_WGS84))
False
>>> inbounds_p(geos.Point(0, 14671436.0, srid=54003))
True
>>> inbounds_p(geos.Point(0, 14671436.1, srid=54003))
False
>>> inbounds_p(geos.Point(0, -14671436.0, srid=54003))
True
>>> inbounds_p(geos.Point(0, -14671436.1, srid=54003))
False

# Ensure that trim() works on multipolygons.
>>> yo = 15e6
>>> yi = 14e6
>>> mp = geos.MultiPoint([geos.Point(0, yi), geos.Point(0, yo)], srid=54003)
>>> trim(mp).coords
(0.0, 14000000.0)

''')
Example #15
    def close(self):
        if (self.writable and self.locked):
            u.lock_release(self.filename)

    def commit(self):
        'Write data to disk.'
        assert (self.writable)
        fp = io.open(self.filename, mode='wb')
        pickle.dump(self.data, fp, pickle.HIGHEST_PROTOCOL)


testable.register('''

>>> import os
>>> import tempfile
>>> testfile = tempfile.mktemp()
>>> a = File(testfile, default=[1,2,3], writable=True)
>>> a.data
[1, 2, 3]
>>> a.data.append(4)
>>> a.data
[1, 2, 3, 4]
>>> a.commit()
>>> del a
>>> b = File(testfile)
>>> b.data
[1, 2, 3, 4]
>>> os.unlink(testfile)

''')
Example #16
testable.register('''

# test that Date_Vector objects can be pickled
>>> import pickle
>>> a = Date_Vector('2013-06-02', np.arange(2, 7))
>>> b = pickle.loads(pickle.dumps(a))
>>> np.array_equal(a, b)
True
>>> a.first_day == b.first_day
True

# make sure repr() output really can be eval()'ed
>>> b = eval(repr(a))
>>> np.array_equal(a, b)
True
>>> a.first_day == b.first_day
True

# do methods that should return scalars do so?
>>> c = np.arange(2, 7)
>>> c.sum()
20
>>> type(c.sum())
<class 'numpy.int64'>
>>> a.sum()
20
>>> type(a.sum())
<class 'numpy.int64'>

''')
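Date_Vector itself is defined elsewhere. A rough sketch, assuming it is an ndarray subclass that carries a first_day attribute through pickling and produces an eval()-able repr(); the project's actual class may differ in detail:

import numpy as np

class Date_Vector(np.ndarray):

    def __new__(cls, first_day, data):
        obj = np.asarray(data).view(cls)
        obj.first_day = first_day
        return obj

    def __array_finalize__(self, obj):
        # Propagate first_day through views and slices.
        self.first_day = getattr(obj, 'first_day', None)

    def __reduce__(self):
        # Append first_day to the pickled ndarray state so it round-trips.
        (ctor, args, state) = np.ndarray.__reduce__(self)
        return (ctor, args, state + (self.first_day,))

    def __setstate__(self, state):
        self.first_day = state[-1]
        np.ndarray.__setstate__(self, state[:-1])

    def __repr__(self):
        return 'Date_Vector(%r, np.%r)' % (self.first_day, np.asarray(self))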
Example #17
class LocalTimezone(datetime.tzinfo):

    def utcoffset(self, dt):
        if self._isdst(dt):
            return DSTOFFSET
        else:
            return STDOFFSET

    def dst(self, dt):
        if self._isdst(dt):
            return DSTDIFF
        else:
            return ZERO

    def tzname(self, dt):
        return time.tzname[self._isdst(dt)]

    def _isdst(self, dt):
        tt = (dt.year, dt.month, dt.day,
              dt.hour, dt.minute, dt.second,
              dt.weekday(), 0, 0)
        stamp = time.mktime(tt)
        tt = time.localtime(stamp)
        return tt.tm_isdst > 0

local_tz = LocalTimezone()


testable.register('')
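The ZERO, STDOFFSET, DSTOFFSET, and DSTDIFF constants referenced above are outside the excerpt; presumably they follow the standard library's tzinfo example, roughly:

import time
from datetime import timedelta

ZERO = timedelta(0)
STDOFFSET = timedelta(seconds=-time.timezone)
if time.daylight:
    DSTOFFSET = timedelta(seconds=-time.altzone)
else:
    DSTOFFSET = STDOFFSET
DSTDIFF = DSTOFFSET - STDOFFSET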
Example #18
def init(core_ct_):
   '''This is here because doctest is not able to set module globals without
      fooling around (this is by design). Perhaps in the future it will have a
      real purpose as well. You do not need to call it, as there are sensible
      defaults (in particular, core_ct = 1 -- you must ask for parallelism).'''
   assert (core_ct_ >= 1)
   global core_ct
   core_ct = core_ct_


testable.register('''

# Does require_multicore work?
>>> init(1)
>>> do(f_test, (1, 2), [(4, 8), (16, 32)])
[15, 51]
>>> do(f_test, (1, 2), [(4, 8), (16, 32)], require_multicore=True)
Traceback (most recent call last):
  ...
ValueError: multicore forced, but core_ct == 1
>>> init(2)
>>> do(f_test, (1, 2), [(4, 8), (16, 32)], require_multicore=True)
[15, 51]

# Don't crash if the length of every is less than core_ct
>>> init(4)
>>> do(f_test, (1, 2), [(1, 1), (2, 2), (3, 3)])
[5, 7, 9]

''')
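f_test() and do() are defined elsewhere in the module. Judging from the expected output, f_test() simply sums the fixed and per-item arguments, and do() maps the function over the item list with the fixed arguments prepended (in parallel when core_ct > 1). A serial sketch under those assumptions:

def f_test(a, b, c, d):
    # e.g. f_test(1, 2, 4, 8) == 15
    return a + b + c + d

def do(fn, fixed, items, require_multicore=False):
    # Serial stand-in; the real version presumably farms items out to a
    # worker pool when core_ct > 1.
    if require_multicore and core_ct == 1:
        raise ValueError('multicore forced, but core_ct == 1')
    return [fn(*(tuple(fixed) + tuple(item))) for item in items]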
Example #19
def init(core_ct_):
   '''This is here because doctest is not able to set module globals without
      fooling around (this is by design). Perhaps in the future it will have a
      real purpose as well. You do not need to call it, as there are sensible
      defaults (in particular, core_ct = 1 -- you must ask for parallelism).'''
   assert (core_ct_ >= 1)
   global core_ct
   core_ct = core_ct_


testable.register('''

# Does require_multicore work?
>>> init(1)
>>> do(f_test, (1, 2), [(4, 8), (16, 32)])
[15, 51]
>>> do(f_test, (1, 2), [(4, 8), (16, 32)], require_multicore=True)
Traceback (most recent call last):
  ...
ValueError: multicore forced, but core_ct == 1
>>> init(2)
>>> do(f_test, (1, 2), [(4, 8), (16, 32)], require_multicore=True)
[15, 51]

# Don't crash if the length of every is less than core_ct
>>> init(4)
>>> do(f_test, (1, 2), [(1, 1), (2, 2), (3, 3)])
[5, 7, 9]

''')
Example #20
T_TW_SIMPLE = Tweet.from_dict({ 'tweet_id':          -1,
                                'created_at':        datetime.now(),
                                'text':              'a b',
                                'user_screen_name':  'c',
                                'user_description':  'd',
                                'user_lang':         'e',
                                'user_location':     'f',
                                'user_time_zone':    'g',
                                'geom':              None,
                                'geom_src':          None })
T_TW_JSON_CO = r'''{"text":"Guantes, bufanda, tenis y chamarra :) #Viena","id_str":"186339941163339776","contributors":null,"in_reply_to_status_id_str":null,"geo":{"type":"Point","coordinates":[48.24424304,16.37778864]},"retweet_count":0,"in_reply_to_status_id":null,"favorited":false,"in_reply_to_user_id":null,"source":"\u003Ca href=\"http:\/\/twitter.com\/#!\/download\/iphone\" rel=\"nofollow\"\u003ETwitter for iPhone\u003C\/a\u003E","created_at":"Sun Apr 01 06:31:18 +0000 2012","in_reply_to_user_id_str":null,"truncated":false,"entities":{"urls":[],"hashtags":[{"text":"Viena","indices":[38,44]}],"user_mentions":[]},"coordinates":{"type":"Point","coordinates":[16.37778864,48.24424304]},"place":{"country":"Austria","place_type":"city","url":"http:\/\/api.twitter.com\/1\/geo\/id\/9f659d51e5c5deae.json","country_code":"AT","bounding_box":{"type":"Polygon","coordinates":[[[16.182302,48.117666],[16.577511,48.117666],[16.577511,48.322574],[16.182302,48.322574]]]},"attributes":{},"full_name":"Vienna, Vienna","name":"Vienna","id":"9f659d51e5c5deae"},"in_reply_to_screen_name":null,"user":{"profile_background_color":"8B542B","id_str":"249409866","profile_background_tile":true,"screen_name":"montse_moso","listed_count":3,"time_zone":"Mexico City","profile_sidebar_fill_color":"ffffff","description":"you  It's exhausting being this Juicy \u2764","default_profile":false,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","created_at":"Wed Feb 09 00:21:15 +0000 2011","profile_sidebar_border_color":"f03368","is_translator":false,"contributors_enabled":false,"geo_enabled":true,"url":null,"profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2003516916\/image_normal.jpg","follow_request_sent":null,"profile_use_background_image":true,"lang":"es","verified":false,"profile_text_color":"333333","protected":false,"default_profile_image":false,"show_all_inline_media":false,"notifications":null,"profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/442998413\/ipod_tamborin.jpg","location":"","name":"Montse Alcaraz ","favourites_count":415,"profile_link_color":"9D582E","id":249409866,"statuses_count":5252,"following":null,"utc_offset":-21600,"friends_count":368,"followers_count":191,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2003516916\/image_normal.jpg"},"retweeted":false,"id":186339941163339776}'''
# FIXME: add test tweets for the other geotag sources


testable.register('''

# Make sure we don't drop anything through all the parsing and unparsing.
>>> a = from_json(T_TW_JSON_CO)
>>> a.geom_src
'co'
>>> a.created_at
datetime.datetime(2012, 4, 1, 6, 31, 18, tzinfo=<UTC>)
>>> a.day
'2012-04-01'
>>> a == Tweet.from_list(a.to_list())
True
>>> a == Tweet.from_dict(a.to_dict())
True

''')
Example #21
   u'''Splits on whitespace, then uses ICU for Latin, Tiny for Japanese.
       Ignores everything else. E.g.:

       >>> Tiny_ICU(1).tokenize(base.T_JP + ' ' + base.T_FR) == base.T_JP_TOKS + base.T_FR_TOKS
       True
       >>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
       True'''

   def __init__(self, ngram):
      base.Tzer.__init__(self, ngram)
      self.tiny = tiny.Tzer(ngram)
      self.icu = ICU(ngram)

   def tokenize_real(self, text):
      ws_tokens = text.split()
      tokens = []
      for ws_token in ws_tokens:
         if (is_latin(ws_token)):
            tokens.extend(self.icu.tokenize(ws_token))
         elif (is_japanese(ws_token)):
            tokens.extend(self.tiny.tokenize(ws_token))
      return tokens


testable.register(u'''

>>> Tiny_ICU(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True

''')
Example #22
testable.register('''

# test that Date_Vector objects can be pickled
>>> import pickle
>>> a = Date_Vector('2013-06-02', np.arange(2, 7))
>>> b = pickle.loads(pickle.dumps(a))
>>> np.array_equal(a, b)
True
>>> a.first_day == b.first_day
True

# make sure repr() output really can be eval()'ed
>>> b = eval(repr(a))
>>> np.array_equal(a, b)
True
>>> a.first_day == b.first_day
True

# do methods that should return scalars do so?
>>> c = np.arange(2, 7)
>>> c.sum()
20
>>> type(c.sum())
<class 'numpy.int64'>
>>> a.sum()
20
>>> type(a.sum())
<class 'numpy.int64'>

''')
Example #23
testable.register(u'''

# FIXME: I haven't figured out how to print the actual Unicode characters in
# order to test them in a natural way. For example, letting the doctest
# "shell" print a Unicode string gets you a heavily encoded string full of
# "\u79c1" escape sequences rather than the characters themselves (you can use
# print to make an individual string work, but that doesn't help for
# sequences). Hence all the tests against True rather than a list.

# Tokenizers should return the empty sequence in some cases
>>> Whitespace(1).tokenize(None)
[]
>>> Whitespace(1).tokenize('')
[]

# ngram < 1 is an error
>>> Whitespace(0).tokenize(None)
Traceback (most recent call last):
   ...
ValueError: ngram must be >= 1, but 0 given

# Test ngrams
>>> Whitespace(1).tokenize('a b c')
['a', 'b', 'c']
>>> Whitespace(2).tokenize('a b c')
['a', 'b', 'c', 'a b', 'b c']
>>> Whitespace(3).tokenize('a b c')
['a', 'b', 'c', 'a b', 'b c', 'a b c']

''')
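Whitespace itself is not shown in this excerpt. A minimal sketch (an assumption, not the module's code) that would satisfy the doctests above: it rejects ngram < 1, returns [] for empty input, and emits all n-grams up to the configured order:

class Whitespace(object):

    def __init__(self, ngram):
        if ngram < 1:
            raise ValueError('ngram must be >= 1, but %d given' % ngram)
        self.ngram = ngram

    def tokenize(self, text):
        if not text:
            return []
        words = text.split()
        tokens = []
        for n in range(1, self.ngram + 1):
            # All n-grams of order n, joined with single spaces.
            tokens.extend(' '.join(words[i:i + n])
                          for i in range(len(words) - n + 1))
        return tokens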
Example #24
                else:
                    tokens.append(cand.lower())
        return tokens


testable.register(
    u"""

>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.DISCARD_SCRIPTS])
True
>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.JP_SCRIPTS])
True
>>> UP_Tiny(1).tokenize(base.T_EN) == base.T_EN_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_FR) == base.T_FR_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_JP) == base.T_JP_TOKS
True
>>> (UP_Tiny(1).tokenize(base.T_JP + ' ' + base.T_FR)
...  == base.T_JP_TOKS + base.T_FR_TOKS)
True
>>> UP_Tiny(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_WEIRD) == base.T_WEIRD_TOKS
True

"""
)
Example #25
import tinysegmenter

from . import base
import testable


class Tzer(base.Tzer):
    '''A wrapper for the TinySegmenter tokenizer for Japanese. e.g.:

      >>> Tzer(1).tokenize(base.T_JP) == base.T_JP_TOKS
      True'''
    def __init__(self, ngram):
        base.Tzer.__init__(self, ngram)
        self.seg = tinysegmenter.TinySegmenter()

    def tokenize_real(self, text):
        return [i.lower() for i in self.seg.tokenize(text)]


testable.register('')
Example #26
        4    5
        dtype: float64
        >>> (X_, y_) = trim_for_fit(X, y, minfinite=1)
        Traceback (most recent call last):
           ...
        Degenerate_Fit_Error: 1 rows left, min=2
        >>> (X_, y_) = trim_for_fit(X, y, minfinite=1, minrows=1)
        >>> X_
           a  b  c
        4  5  5  5
        >>> y_
        4    5
        dtype: float64"""
    assert len(X) == len(y)
    y_keep = pd.notnull(y)
    X_keep = ((X != 0) & X.notnull()).astype(int).sum(axis=1) >= minfinite * len(X.columns)
    mask = y_keep & X_keep
    X = X.loc[mask]
    y = y.loc[mask]
    assert len(X) == len(y)
    if len(X) < minrows:
        raise Degenerate_Fit_Error("%d rows left, min=%d" % (len(X), minrows))
    return (X, y)


# Since this stuff is experimental, we don't make the standard test suite
# depend on it.
#
# Test-Depends: manual
testable.register()
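Degenerate_Fit_Error is raised above but not defined in the excerpt; presumably it is just a module-level exception class along these lines:

class Degenerate_Fit_Error(Exception):
    'Raised when too few usable rows remain to fit a model.'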
Example #27
                if (key[0] in self.JP_SCRIPTS):
                    tokens.extend(self.tiny.tokenize(cand))
                else:
                    tokens.append(cand.lower())
        return tokens


testable.register(u'''

>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.DISCARD_SCRIPTS])
True
>>> all([s in unicodedata2.script_data['names']
...      for s in UP_Tiny.JP_SCRIPTS])
True
>>> UP_Tiny(1).tokenize(base.T_EN) == base.T_EN_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_FR) == base.T_FR_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_JP) == base.T_JP_TOKS
True
>>> (UP_Tiny(1).tokenize(base.T_JP + ' ' + base.T_FR)
...  == base.T_JP_TOKS + base.T_FR_TOKS)
True
>>> UP_Tiny(1).tokenize(base.T_PUNCT) == base.T_PUNCT_TOKS
True
>>> UP_Tiny(1).tokenize(base.T_WEIRD) == base.T_WEIRD_TOKS
True

''')
Example #28
                continue


class TSV_Output_Job(Job):
    '''Mixin for TSV UTF-8 text output. :meth:`reduce_write()` expects a
      sequence of stringifiable objects.'''
    def reduce_open_output(self):
        assert False, 'unimplemented'

    def reduce_write(self, item):
        self.outfp.writerow(item)


testable.register(r'''

# Test data passing from mapper to reducer.
>>> from cStringIO import StringIO
>>> buf = StringIO()
>>> job = Test_Job()
>>> job.outfp = buf
>>> for kv in [(1, -1), (2, -2), (2, -3), (3, -4), (3, -5), (3, -6)]:
...    job.map_write(*kv)
>>> buf.getvalue()
'1\tgAJK/////y4=\n2\tgAJK/v///y4=\n2\tgAJK/f///y4=\n3\tgAJK/P///y4=\n3\tgAJK+////y4=\n3\tgAJK+v///y4=\n'
>>> buf.seek(0)
>>> job.infp = buf
>>> [(k, list(v)) for (k, v) in job.reduce_inputs()]
[(u'1', [-1]), (u'2', [-2, -3]), (u'3', [-4, -5, -6])]

''')