def test_from_csv():
    """Compare the first five strings read from the tweets CSV to known text.

    Reads ``../../data/tweets.csv`` with ``nvstrings.from_csv`` (second
    argument 7), takes the leading five entries and checks them against the
    literal strings below with ``assert_eq``.
    """
    # The doubled double-quotes in the last two entries are part of the
    # expected text exactly as written here.
    expected = [
        "@Bill_Porter nice to know that your site is back :-)",
        "@sudhamshu after trying out various tools to take notes and I found that paper is the best to take notes and to maintain todo lists.",
        "@neetashankar Yeah, I got the connection. I am getting 20 mbps for a 15 mbps connection. Customer service is also good.",
        '@Bill_Porter All posts from your website http://t.co/NUWn5HUFsK seems to have been deleted. I am getting a ""Not Found"" page even in homepage',
        'Today is ""bring your kids"" day at office and the entire office is taken over by cute little creatures ;)',
    ]

    tweets = nvstrings.from_csv("../../data/tweets.csv", 7)
    got = tweets[:5]

    assert_eq(got, expected)
import nvstrings
import pandas as pd  # required by pd.Series below; was missing (NameError)
import time

# Benchmark script: times nvstrings.slice() against pandas .str.slice() on the
# same strings, for 1, 2 and 3 million lines read from the reviews file.

#df = pd.read_csv('/home/jovyan/reviews-1m.csv', sep=',')
#values = df["text"].values
#vlist = values.tolist()

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

for i in range(3):
    lines = (i+1) * 1000000
    #vlist.extend(vlist)
    #print("strings:",len(vlist))
    #
    #dstrs = nvstrings.to_device(vlist)
    dstrs = nvstrings.from_csv("/home/jovyan/reviews.txt",0,lines=lines)
    # Copy the same strings to a Python list so pandas works on identical data.
    vlist = dstrs.to_host()
    print("strings = ",len(vlist))
    hstrs = pd.Series(vlist)

    # Time the nvstrings slice.
    st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    d = dstrs.slice(3,103)
    et1 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
    print("nvstrings.slice() = %05f" % et1)

    # Time the equivalent pandas slice.
    st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    h = hstrs.str.slice(3,103)
    et2 = (time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st)
    print(" pandas.slice() = %05f" % et2)

    # Ratio of pandas time to nvstrings time.
    print("speedup = %0.1fx" % (et2/et1) )
    #
# Remove the strings in `strs` from the category and show what is left.
print(strs.size(), strs)
cat = cat.remove_strings(strs)
print(cat.size(), cat.keys())
print(".values():", cat.values())
print(".value_for_index(7)", cat.value_for_index(7))
print(".value(fff):", cat.value('fff'))
print(".indexes_for_key(fff):", cat.indexes_for_key('fff'))
print(".to_strings():", cat.to_strings())

# multiple strings in one call
print("-------------------------")
strs1 = nvstrings.to_device(
    ["eee", "aaa", "eee", "ddd", "ccc", "ccc", "ccc", "eee", "aaa"]
)
strs2 = nvstrings.to_device(
    ["ggg", "fff", "hhh", "aaa", "fff", "fff", "ggg", "hhh", "bbb"]
)
print(".from_strings(strs1,strs2)")
cat = nvcategory.from_strings(strs1, strs2)
print(cat.size(), cat)
print(".values():", cat.values())
print(".value(ccc):", cat.value('ccc'))
print(".indexes_for_key(ccc):", cat.indexes_for_key('ccc'))
print(".gather_strings([0,2,0,3,1]):", cat.gather_strings([0, 2, 0, 3, 1]))

# Masonry, Reinforced Concrete, Reinforced Masonry, Steel Frame, Wood
print("-------------------------")
print("36634-rows.csv:")
strs = nvstrings.from_csv("../../data/36634-rows.csv", 16)
cat = nvcategory.from_strings(strs)
print(cat.size(), cat.keys())
print("len(.values()):", len(cat.values()))
print(".value(Wood):", cat.value('Wood'))
import pandas as pd
import nvstrings
import time

# Benchmark script: times nvstrings.contains() against pandas .str.contains()
# for the same regex on the same list of strings.

dstrs_in = nvstrings.from_csv('../tweets.csv', 7)
vlist = dstrs_in.to_host()

# Double the list eight times (256x the original number of strings).
for _ in range(8):
    vlist.extend(vlist)
len(vlist)  # value is discarded when run as a script

# Build both inputs from the same Python list.
dstrs = nvstrings.to_device(vlist)
hstrs = pd.Series(vlist)

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print("strings =", dstrs.size())

# nvstrings timing
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
d = dstrs.contains('@.+@')
et1 = time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st
print("nvstrings.contains('@.+@') = %05f" % et1)

# pandas timing
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
h = hstrs.str.contains('@.+@')
et2 = time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st
import pandas as pd import nvstrings import time # setup rmm to use memory pool from librmm_cffi import librmm as rmm from librmm_cffi import librmm_config as rmm_cfg rmm_cfg.use_pool_allocator = True rmm_cfg.initial_pool_size = 8 << 30 # 8GB rmm.initialize() strs = nvstrings.from_csv('/data/tweets.csv', 7).to_host() vlist1 = [] vlist1.extend(strs) vlist1.extend(strs) vlist1.extend(strs) vlist1.extend(vlist1) vlist1.extend(vlist1) vlist1.extend(vlist1) vlist1.extend(vlist1) vlist1.extend(vlist1) vlist1.extend(vlist1) vlist1.extend(vlist1) print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW)) stats = {'strings': [], 'pandas': [], 'nvstrings': []} vlist = [] for i in range(20):
import pandas as pd
import nvstrings
import time

# Benchmark setup: builds a large base list of tweet strings, then grows the
# working list by one base list per iteration and creates the nvstrings and
# pandas inputs from it.

strs = nvstrings.from_csv('/home/jovyan/tweets.csv', 7).to_host()

# Base list: three copies of the tweets, then doubled seven times.
vlist1 = []
for _ in range(3):
    vlist1.extend(strs)
for _ in range(7):
    vlist1.extend(vlist1)

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

stats = {'strings': [], 'pandas': [], 'nvstrings': []}
vlist = []
for i in range(50):
    # Grow the working list and record its new length.
    vlist.extend(vlist1)
    stats['strings'].append(len(vlist))

    # Build both inputs from the same Python list.
    dstrs = nvstrings.to_device(vlist)
    hstrs = pd.Series(vlist)
    #
import nvstrings

# Read the tweets CSV (second argument 7) and print slice(1, 15) of the strings.
strs = nvstrings.from_csv('../../data/tweets.csv', 7)
print(
    "slice(1,15):",
    strs.slice(1, 15),
)
import nvstrings
import nvcategory
import numpy as np
from numba import cuda
import time

# Benchmark script: times nvcategory.from_strings_list() as the input list
# grows by one more reference to the same nvstrings object per iteration.

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

lines = 1000000
# column 5 = style (e.g. American Pale Ale, Vienna Lager, etc)
dstrs = nvstrings.from_csv(
    "/home/dwendt/data/reviews/beers-1m.csv", 5, lines=lines
)
#input("press enter")

slist = []
stats1 = []
stats2 = []
for i in range(50):
    idx = i + 1
    #print(idx,'million')

    slist.append(dstrs)

    # Time building a category from the whole list.
    st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
    cat = nvcategory.from_strings_list(slist)
    et = time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st

    print(cat.keys_size(), cat.size())
    print(" from_strings_list = %05f" % et)
    stats1.append(et)
import nvstrings

# Read the CSV (second argument 1), call split_column(" ", 2) on the strings
# and print element 1 of the result.
strs = nvstrings.from_csv("../../data/7584-rows.csv", 1)
#print(strs)

cols = strs.split_column(" ", 2)
print(cols[1])
#print(cols[1].len())
import nvstrings
import numpy as np
from numba import cuda
import time

# Benchmark setup: loads review text and walks through growing prefixes of
# the `rwords` list, printing each prefix.

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))

lines = 1000000
dstrs = nvstrings.from_csv("/home/dwendt/data/reviews/reviews.txt",0,lines=lines)

# there are 14 of these:
rwords = [
    "fruit", "vintage", "zest", "foam", "sweet", "juic", "malt",
    "wheat", "citrus", "pine", "crisp", "dark", "golden", "bitter",
]

# there are 133 of these:
swords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
    'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
    'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
    'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now', 'uses', 'use',
    'using', 'used', 'one', 'also',
]

stats = []
for i in range(len(rwords)):
    # Prefix of rwords holding the first i + 1 entries.
    words = rwords[:i + 1]
    print(words)
import pandas as pd
import nvstrings
import time

# Benchmark script: times nvstrings.contains() against pandas .str.contains()
# for the same regex on the same list of strings.

dstrs_in = nvstrings.from_csv('/data/tweets.csv', 7)
vlist = dstrs_in.to_host()

# Double the list eight times (256x the original number of strings).
for _ in range(8):
    vlist.extend(vlist)
len(vlist)  # value is discarded when run as a script

# Build both inputs from the same Python list.
dstrs = nvstrings.to_device(vlist)
hstrs = pd.Series(vlist)

print("precision = %0.9f seconds" % time.clock_getres(time.CLOCK_MONOTONIC_RAW))
print("strings =", dstrs.size())

# nvstrings timing
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
d = dstrs.contains('@.+@')
et1 = time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st
print("nvstrings.contains('@.+@') = %05f" % et1)

# pandas timing
st = time.clock_gettime(time.CLOCK_MONOTONIC_RAW)
h = hstrs.str.contains('@.+@')
et2 = time.clock_gettime(time.CLOCK_MONOTONIC_RAW) - st