Example no. 1
    def test_numeric_substitution(self):
        hasher = SRP.SRP(36)
        string1 = "I was born in 2001"
        string2 = "I was born in 1907"
        h1 = hasher.stable_transform(string1, log=False, standardize=True)
        # With standardize=True, differing digit strings should not
        # change the hash, so the two sentences hash identically.
        h2 = hasher.stable_transform(string2, log=False, standardize=True)
        self.assertEqual(h1.tolist(), h2.tolist())
Example no. 2
    def test_unicode(self):
        """
        One of the goals is to be able to pass *either* encoded or decoded
        utf-8, because that tends to happen.

        These tests are a lot easier to pass now that python2 is deprecated.

        """
        hasher = SRP.SRP(6)
        guten = u"Güten Tag"
        gutenhash = np.array([0., 2., -2., 0., 2., 0.]).tolist()

        basic = hasher.stable_transform(guten, log=False,
                                        unit_length=False).tolist()
        self.assertEqual(basic, gutenhash)

        encoded = hasher.stable_transform(guten.encode("utf-8"),
                                          log=False,
                                          unit_length=False).tolist()
        self.assertEqual(encoded, gutenhash)

        decoded = hasher.stable_transform(
            guten.encode("utf-8").decode("utf-8"),
            log=False,
            unit_length=False).tolist()
        self.assertEqual(decoded, gutenhash)
Example no. 3
    def test_ascii(self):
        hasher = SRP.SRP(6)
        hello_world = hasher.stable_transform("hello world",
                                              log=False,
                                              unit_length=False)

        self.assertEqual(hello_world.tolist(),
                         np.array([0., 0., 2., 0., 2., 0.]).tolist())
Example no. 4
def SRP_transform(f):
    # Lazily initialize a module-level hasher so a single SRP object
    # is built once and shared across calls.
    global hasher
    if hasher is None:
        hasher = SRP.SRP(640)
    return hasher.stable_transform(words=f['token'],
                                   counts=f['count'],
                                   log=True,
                                   standardize=True)
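A minimal usage sketch for SRP_transform above, assuming f is anything column-addressable by 'token' and 'count' (a dict of lists or a pandas DataFrame, for instance); the sample counts here are hypothetical:

import SRP

hasher = None  # The global that SRP_transform initializes lazily.

fake_counts = {'token': ['hello', 'world'], 'count': [2, 1]}
vec = SRP_transform(fake_counts)  # A 640-dimensional numpy array.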
Example no. 5
    def test_ascii_equals_unicode(self):
        hasher = SRP.SRP(160)

        hello_world = hasher.stable_transform("hello world",
                                              log=False).tolist()
        hello_world_unicode = hasher.stable_transform(u"hello world",
                                                      log=False).tolist()

        self.assertEqual(hello_world, hello_world_unicode)
Example no. 6
    def test_logs_are_plausible(self):
        log_unit = np.log(1e05)

        hasher = SRP.SRP(20)
        log_srp = hasher.stable_transform("hello", log=True)
        nonlog_srp = hasher.stable_transform("hello", log=False)
        difference = sum(log_srp - nonlog_srp * log_unit)

        # Forgive floating point error; check the magnitude, not the sign.
        self.assertTrue(abs(difference) < 1e-05)
Example no. 7
    def test_wordcounts_unicode(self):
        hasher = SRP.SRP(160)

        wordcount_style = hasher.stable_transform(words=[u"Güten", u"Tag"],
                                                  counts=[1, 1],
                                                  log=False).tolist()

        string_style = hasher.stable_transform(words=u"Güten Tag",
                                               log=False).tolist()

        self.assertEqual(wordcount_style, string_style)
Example no. 8
    def test_standardization(self):
        """
        Standardization does case normalization
        and tokenizes by a character regex.
        """
        hasher = SRP.SRP(6)
        string1 = "Gravity's rainbow"
        hashed_standardized = hasher.stable_transform(string1,
                                                      log=False,
                                                      standardize=True)
        manually_tokenized = ["Gravity", "s", "RAINBOW"]
        hashed_manually_tokenized = hasher.stable_transform(manually_tokenized,
                                                            [1, 1, 1],
                                                            log=False,
                                                            standardize=True)
        self.assertEqual(hashed_manually_tokenized.tolist(),
                         hashed_standardized.tolist())
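Since standardize=True case-normalizes before hashing, differently cased spellings of the same string should produce identical vectors. A small sketch of that property (not from the original test suite):

hasher = SRP.SRP(6)
h1 = hasher.stable_transform("GRAVITY'S RAINBOW", log=False, standardize=True)
h2 = hasher.stable_transform("gravity's rainbow", log=False, standardize=True)
assert h1.tolist() == h2.tolist()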
Example no. 9
def textset_to_srp(inputfile,
                   outputfile,
                   dim=640,
                   limit=float("Inf"),
                   log=True):
    """
    A convenience wrapper for converting a text corpus to
    an SRP collection as a block.

    inputfile is the collection. The format is the same as the ingest
    format used by bookworm and mallet; that is to say, a single unicode
    file where each line is a document represented as a unique filename,
    then a tab, and then a long string containing the text of the document.
    To coerce into this format, newlines must be removed.

    inputfile can also be a **directory** of txt files.

    outputfile is the SRP file to be created. Recommended suffix is `.bin`.

    dim is the dimensionality of the output SRP.

    """
    import os
    import SRP

    output = SRP.Vector_file(outputfile, dims=dim, mode="w")
    hasher = SRP.SRP(dim=dim)

    if inputfile.endswith(".txt"):
        yielder = textset_yielder
    elif os.path.isdir(inputfile):
        yielder = directory_yielder
    else:
        raise ValueError(
            "Don't know how to process {}: must be a text file or a directory"
            .format(inputfile))

    for i, (id, txt) in enumerate(yielder(inputfile)):
        transform = hasher.stable_transform(txt, log=log, standardize=True)
        output.add_row(id, transform)
        if i + 1 >= limit:
            break

    output.close()
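A minimal usage sketch for textset_to_srp, assuming a small corpus file in the tab-separated ingest format described in the docstring; the filenames and texts here are hypothetical:

# One document per line: a unique id, a tab, then the full text
# with newlines removed.
with open("corpus.txt", "w") as f:
    f.write("doc1\thello world hello\n")
    f.write("doc2\tgravity s rainbow\n")

# Hash each document into a 640-dimensional SRP vector and write
# the result to corpus.bin in SRP's Vector_file format.
textset_to_srp("corpus.txt", "corpus.bin", dim=640)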
Example no. 10
def main():
    parser = argparse.ArgumentParser(description="Convert Extracted Features files to vectors, and save in SRP's Vector_file format.")

    parser.add_argument('threadno', type=int, help='Non-zero indexed number of thread.')
    parser.add_argument('totalthreads', type=int, help='Number of threads running in total.')
    parser.add_argument('idlist', type=str, help='CSV file of HTIDs to process. Needs a header, with column name \'htid\'.')
    parser.add_argument('--outdir', '-o', type=str, default='data_outputs/', help='Directory to save results.')
    parser.add_argument('--chunksize', '-c', type=int, default=10000, help='Size of chunks to roll pages into. -1 will make books into a single full book "chunk".')
    parser.add_argument('--no-srp', action='store_true', help='Turn off SRP saving')
    parser.add_argument('--no-glove', action='store_true', help='Turn off Glove saving')
    parser.add_argument('--glove-dims', '-g', type=int, default=300, help='Number of GloVe dimensions. Can be 50, 100, 200, or 300.')
    parser.add_argument('--srp-dims', '-s', type=int, default=640, help='Number of SRP dimensions.')
    parser.add_argument('--efdir', type=str, default=None,
                        help='Set an alternative base url for EF files, if not using the setting from local.yaml or ~/.htrc-config.yaml.')
    args = parser.parse_args()
    
    thread_no = args.threadno - 1 # Zero index.
    
    # At least one of the SRP and GloVe outputs must be enabled.
    assert not (args.no_srp and args.no_glove)
    
    if args.efdir is not None:
        customizable_resolver.dir = args.efdir

    already_imported = already_imported_list(args.outdir)
    print("There are {} files already imported".format(len(already_imported)))
    filenames = pd.read_csv(args.idlist, low_memory=False)
    
    thread_name = "{}-of-{}_".format(thread_no + 1, args.totalthreads)

    already_seen_file = open(os.path.join(args.outdir, "already_completed_files{}.csv".format(thread_name)), "a")

    if not args.no_srp:
        hasher = SRP.SRP(args.srp_dims)
        out_SRP = SRP.Vector_file(os.path.join(args.outdir, thread_name + "SRP_chunks.bin"), dims=args.srp_dims, mode="w")
        
        def SRP_transform(f):
            return hasher.stable_transform(words=f['lowercase'], counts=f['count'],
                                           log=True, standardize=True)
    
    if not args.no_glove:
        wem_model = wem_loader('glove-wiki-gigaword-{}'.format(args.glove_dims))
        
        # Cross-ref with stoplist and drop stopped words
        from spacy.lang.en.stop_words import STOP_WORDS
        wem_vocab = set(wem_model.vocab.keys())
        wem_vocab = wem_vocab.difference(STOP_WORDS)
        
        out_glove = SRP.Vector_file(os.path.join(args.outdir, thread_name + "Glove_chunks.bin"), dims=args.glove_dims, mode="w")
        
        def WEM_transform(f):
            return transformations.chunk_to_wem(f, wem_model, vocab=wem_vocab, stop=False, log=True, min_ncount=10)

    books = 0
    last = None
    start_time = time.time()

    try:
        gen = yielder(filenames.htid, thread_no, args.totalthreads, 
                      chunk_size=args.chunksize, already_imported_list=already_imported)
        for i, (id, chunk, start, end, group) in enumerate(gen):
            # Count books too.
            if last != id:
                books += 1
                if last is not None:
                    already_seen_file.write("{}\n".format(last))
                    if (books % 25 == 0):
                        rate = books/(time.time()-start_time)
                        print("{} books done on thread {} of {}, {:.02f} chunks per book, {:.02f} books per second".format(books, thread_no + 1, args.totalthreads, i/books, rate))
            last = id

            id = "{}-{:04d}-{}-{}".format(id, chunk, start, end)

            if not args.no_srp:
                SRP_rep = SRP_transform(group)
                out_SRP.add_row(id, SRP_rep)

            if not args.no_glove:
                WEM_rep = WEM_transform(group)
                if WEM_rep.shape[0] != args.glove_dims:
                    print(WEM_rep.shape, args.glove_dims)
                try:
                    out_glove.add_row(id, WEM_rep.astype('<f4'))
                except Exception:
                    print(id, WEM_rep.shape, args.glove_dims, wem_model.vector_size)
                    raise
                    
        already_seen_file.write("{}\n".format(last))

    except KeyboardInterrupt:
        # Allow a clean exit on interrupt; the files are closed below.
        pass
    finally:
        if not args.no_srp:
            out_SRP.close()
        if not args.no_glove:
            out_glove.close()
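A sketch of reading the written vectors back, assuming SRP.Vector_file opens in read mode and yields (id, vector) pairs when iterated, as in the pysrp README; the path below is hypothetical (thread 1 of 1 with the default --outdir):

import SRP

vecs = SRP.Vector_file("data_outputs/1-of-1_SRP_chunks.bin", mode="r")
for id, vector in vecs:
    print(id, vector[:5])  # First five of args.srp_dims dimensions.
vecs.close()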
Example no. 11
    def test_dtype(self):
        hasher = SRP.SRP(6)
        hello_world = hasher.stable_transform("hello world", log=False)
        self.assertEqual(hello_world.dtype, np.float32)