def test_numeric_substitution(self):
    hasher = SRP.SRP(36)
    string1 = "I was born in 2001"
    string2 = "I was born in 1907"
    h1 = hasher.stable_transform(string1, log=False, standardize=True)
    # The second hash must come from string2, or the test compares a string
    # with itself and the numeric-substitution behavior is never exercised.
    h2 = hasher.stable_transform(string2, log=False, standardize=True)
    self.assertEqual(h1.tolist(), h2.tolist())
def test_unicode(self):
    """
    One of the goals is to be able to pass *either* encoded or decoded
    utf-8, because that tends to happen. These tests are a lot easier to
    pass now that python2 is deprecated.
    """
    hasher = SRP.SRP(6)
    guten = u"Güten Tag"
    gutenhash = np.array([0., 2., -2., 0., 2., 0.]).tolist()

    basic = hasher.stable_transform(guten, log=False, unit_length=False).tolist()
    self.assertTrue(basic == gutenhash)

    encoded = hasher.stable_transform(guten.encode("utf-8"),
                                      log=False, unit_length=False).tolist()
    self.assertTrue(encoded == gutenhash)

    decoded = hasher.stable_transform(
        guten.encode("utf-8").decode("utf-8"),
        log=False, unit_length=False).tolist()
    self.assertTrue(decoded == gutenhash)
def test_ascii(self):
    hasher = SRP.SRP(6)
    hello_world = hasher.stable_transform("hello world", log=False, unit_length=False)
    self.assertEqual(hello_world.tolist(),
                     np.array([0., 0., 2., 0., 2., 0.]).tolist())
def SRP_transform(f):
    # Lazily build the module-level hasher once, then reuse it on every call.
    global hasher
    if hasher is None:
        hasher = SRP.SRP(640)
    return hasher.stable_transform(words=f['token'], counts=f['count'],
                                   log=True, standardize=True)
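# The lazy "global hasher" above is the usual pattern when SRP_transform is
# called many times (for instance, mapped across worker processes): the
# 640-dimension hasher is built once per process and then reused. A minimal
# usage sketch, assuming a pandas frame with 'token' and 'count' columns
# (the frame and its values below are hypothetical):
#
#     import pandas as pd
#     f = pd.DataFrame({"token": ["hello", "world"], "count": [2, 1]})
#     vec = SRP_transform(f)  # returns a 640-dimensional vector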
def test_ascii_equals_unicode(self):
    hasher = SRP.SRP(160)
    hello_world = hasher.stable_transform("hello world", log=False).tolist()
    hello_world_unicode = hasher.stable_transform(u"hello world", log=False).tolist()
    self.assertEqual(hello_world, hello_world_unicode)
def test_logs_are_plausible(self):
    log_unit = np.log(1e05)
    hasher = SRP.SRP(20)
    log_srp = hasher.stable_transform("hello", log=True)
    nonlog_srp = hasher.stable_transform("hello", log=False)
    difference = sum(log_srp - nonlog_srp * log_unit)
    # Forgive floating point error, but fail on deviations in either direction.
    self.assertTrue(abs(difference) < 1e-05)
def test_wordcounts_unicode(self):
    hasher = SRP.SRP(160)
    wordcount_style = hasher.stable_transform(
        words=[u"Güten", u"Tag"], counts=[1, 1], log=False).tolist()
    string_style = hasher.stable_transform(words=u"Güten Tag", log=False).tolist()
    self.assertEqual(wordcount_style, string_style)
def test_standardization(self):
    """
    Standardization does case normalization, and tokenizes by a
    character regex.
    """
    hasher = SRP.SRP(6)
    string1 = "Gravity's rainbow"
    hashed_standardized = hasher.stable_transform(string1, log=False, standardize=True)
    # Case and punctuation differences should be erased by standardization.
    manually_tokenized = ["Gravity", "s", "RAINBOW"]
    hashed_manually_tokenized = hasher.stable_transform(
        manually_tokenized, [1, 1, 1], log=False, standardize=True)
    self.assertEqual(hashed_manually_tokenized.tolist(),
                     hashed_standardized.tolist())
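# Roughly, the standardization exercised above appears to amount to
# lowercasing plus tokenization on word characters. A sketch of that
# assumption (not the library's actual implementation):
#
#     import re
#     tokens = re.findall(r"\w+", "Gravity's rainbow".lower())
#     # -> ['gravity', 's', 'rainbow'], matching the manual tokens above
#     #    once case is normalized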
def textset_to_srp(inputfile, outputfile, dim=640, limit=float("Inf"), log=True):
    """
    A convenience wrapper for converting a text corpus to an SRP collection
    as a block.

    inputfile is the collection. The format is the same as the ingest format
    used by bookworm and mallet; that is to say, a single unicode file where
    each line is a document represented as a unique filename, then a tab, and
    then a long string containing the text of the document. To coerce into
    this format, newlines must be removed.

    inputfile can also be a **directory** of txt files.

    outputfile is the SRP file to be created. Recommended suffix is `.bin`.

    dim is the dimensionality of the output SRP.
    """
    import SRP

    output = Vector_file(outputfile, dims=dim, mode="w")
    hasher = SRP.SRP(dim=dim)

    if inputfile.endswith(".txt"):
        yielder = textset_yielder
    elif os.path.isdir(inputfile):
        yielder = directory_yielder
    else:
        raise ValueError(
            "Don't know how to process {}: must be a textfile or a directory"
            .format(inputfile))

    for i, (id, txt) in enumerate(yielder(inputfile)):
        transform = hasher.stable_transform(txt, log=log, standardize=True)
        output.add_row(id, transform)
        if i + 1 >= limit:
            break

    output.close()
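# A minimal usage sketch for textset_to_srp, assuming a tab-separated corpus
# file laid out as the docstring describes (the file names here are
# illustrative):
#
#     # corpus.txt: one document per line, "<doc_id>\t<text, newlines removed>"
#     textset_to_srp("corpus.txt", "corpus_srp.bin", dim=640)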
def main():
    parser = argparse.ArgumentParser(
        description="Convert Extracted Features files to vectors, and save "
                    "in SRP's Vector_file format.")
    parser.add_argument('threadno', type=int,
                        help='Non-zero indexed number of thread.')
    parser.add_argument('totalthreads', type=int,
                        help='Number of threads running in total.')
    parser.add_argument('idlist', type=str,
                        help="CSV file of HTIDs to process. Needs a header, "
                             "with column name 'htid'.")
    parser.add_argument('--outdir', '-o', type=str, default='data_outputs/',
                        help='Directory to save results.')
    parser.add_argument('--chunksize', '-c', type=int, default=10000,
                        help='Size of chunks to roll pages into. -1 will make '
                             'books into a single full-book "chunk".')
    parser.add_argument('--no-srp', action='store_true',
                        help='Turn off SRP saving')
    parser.add_argument('--no-glove', action='store_true',
                        help='Turn off GloVe saving')
    parser.add_argument('--glove-dims', '-g', type=int, default=300,
                        help='Number of GloVe dimensions. Can be 50, 100, 200, or 300.')
    parser.add_argument('--srp-dims', '-s', type=int, default=640,
                        help='Number of SRP dimensions.')
    parser.add_argument('--efdir', type=str, default=None,
                        help='Set an alternative base url for EF files, if not using '
                             'the setting from local.yaml or ~/.htrc-config.yaml.')
    args = parser.parse_args()

    thread_no = args.threadno - 1  # Zero index.
    assert not (args.no_srp and args.no_glove)

    if args.efdir is not None:
        customizable_resolver.dir = args.efdir

    already_imported = already_imported_list(args.outdir)
    print("There are {} files already imported".format(len(already_imported)))

    filenames = pd.read_csv(args.idlist, low_memory=False)
    thread_name = "{}-of-{}_".format(thread_no + 1, args.totalthreads)
    already_seen_file = open(
        os.path.join(args.outdir,
                     "already_completed_files{}.csv".format(thread_name)), "a")

    if not args.no_srp:
        hasher = SRP.SRP(args.srp_dims)
        out_SRP = SRP.Vector_file(
            os.path.join(args.outdir, thread_name + "SRP_chunks.bin"),
            dims=args.srp_dims, mode="w")

        def SRP_transform(f):
            return hasher.stable_transform(words=f['lowercase'], counts=f['count'],
                                           log=True, standardize=True)

    if not args.no_glove:
        wem_model = wem_loader('glove-wiki-gigaword-{}'.format(args.glove_dims))
        # Cross-ref with stoplist and drop stopped words.
        from spacy.lang.en.stop_words import STOP_WORDS
        wem_vocab = set(wem_model.vocab.keys())
        wem_vocab = wem_vocab.difference(STOP_WORDS)
        out_glove = SRP.Vector_file(
            os.path.join(args.outdir, thread_name + "Glove_chunks.bin"),
            dims=args.glove_dims, mode="w")

        def WEM_transform(f):
            return transformations.chunk_to_wem(f, wem_model, vocab=wem_vocab,
                                                stop=False, log=True, min_ncount=10)

    books = 0
    last = None
    start_time = time.time()

    try:
        gen = yielder(filenames.htid, thread_no, args.totalthreads,
                      chunk_size=args.chunksize,
                      already_imported_list=already_imported)
        for i, (id, chunk, start, end, group) in enumerate(gen):
            # Count books too.
            if last != id:
                books += 1
                if last is not None:
                    already_seen_file.write("{}\n".format(last))
                if books % 25 == 0:
                    rate = books / (time.time() - start_time)
                    print("{} books done on thread {} of {}, {:.02f} chunks per book, "
                          "{:.02f} books per second".format(
                              books, thread_no + 1, args.totalthreads,
                              i / books, rate))
                last = id

            id = "{}-{:04d}-{}-{}".format(id, chunk, start, end)

            if not args.no_srp:
                SRP_rep = SRP_transform(group)
                out_SRP.add_row(id, SRP_rep)

            if not args.no_glove:
                WEM_rep = WEM_transform(group)
                if WEM_rep.shape[0] != args.glove_dims:
                    print(WEM_rep.shape, args.glove_dims)
                try:
                    out_glove.add_row(id, WEM_rep.astype('<f4'))
                except:
                    print(id, WEM_rep.shape, args.glove_dims, wem_model.vector_size)
                    raise

        already_seen_file.write("{}\n".format(last))

        if not args.no_srp:
            out_SRP.close()
        if not args.no_glove:
            out_glove.close()

    except KeyboardInterrupt:
        if not args.no_srp:
            out_SRP.close()
        if not args.no_glove:
            out_glove.close()
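# A hypothetical invocation of this script (the script name and paths are
# illustrative; thread numbers are 1-indexed on the command line and
# zero-indexed internally):
#
#     python ef_to_vectors.py 1 4 htids.csv --outdir data_outputs/ --chunksize 10000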
def test_dtype(self):
    hasher = SRP.SRP(6)
    hello_world = hasher.stable_transform("hello world", log=False)
    self.assertEqual(hello_world.dtype, np.float32)