Example #1
def get_count(path, store=False):
    '''Get token count information for a single doc, by path.'''
    from htrc_features import FeatureReader
    max_char = 50
    logging.debug(path)
    vol = FeatureReader(path).first()
    logging.debug(vol)
    logging.debug(vol.id)
    logging.debug(vol.language)
    logging.debug(type(vol.language))
    tl = vol.tokenlist(pages=False, pos=False)
    if tl.empty:
        return tl
    else:
        tl = tl.reset_index('section')[['count']]
    tl.index = [trim_token(t, max_char) for t in tl.index.values]
    tl.index.names = ['token']
    tl['id'] = vol.id
    if type(vol.language) is list:
        tl['language'] = vol.language[0]
    else:
        tl['language'] = vol.language
    tl = tl.reset_index('token').set_index(['language', 'id',
                                            'token']).sort_index()
    logging.debug(tl)
    return tl
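A minimal usage sketch for get_count() above, assuming a locally downloaded Extracted Features file at a hypothetical path and the module's trim_token() helper in scope:

import logging

logging.basicConfig(level=logging.INFO)

# 'data/sample.json.bz2' is a placeholder for any local Extracted Features file.
counts = get_count('data/sample.json.bz2')
print(counts.head())  # DataFrame indexed by (language, id, token) with a 'count' column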
Example #2
 def test_json_only_load(self, paths):
     path = paths[0]
     feature_reader = FeatureReader(path)
     json = next(feature_reader.jsons())
     assert type(json) == dict
     assert json['features']['pages'][7]['header']['tokenCount'] == 5
     assert json['features']['pages'][7]['body']['capAlphaSeq'] == 2
Example #3
 def test_json_only_load(self, paths):
     path = paths[0]
     feature_reader = FeatureReader(path)
     json = next(feature_reader.jsons())
     assert type(json) == dict
     assert json['features']['pages'][7]['header']['tokenCount'] == 5
     assert json['features']['pages'][7]['body']['capAlphaSeq'] == 2
Example #4
def create_corpus(ids, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2','') for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    fr = FeatureReader(paths)
    corpus = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        vols = [executor.submit(process_pages, vol) 
                    for id_n, vol in enumerate(fr.volumes())]
        
        if verbose:
            for f in concurrent.futures.as_completed(vols):
                n += 1
                pbar.update(n)

        corpus = map(concurrent.futures.Future.result, vols)
        if verbose:
            pbar.finish()
    corpus = list(corpus)
    
    c = corpus_fromlist(corpus, context_type='book')
    c = apply_stoplist(c, nltk_stop=True, freq=5)
    c.context_data[0]['book_label'] = filtered_ids

    return c
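A hedged usage sketch for create_corpus(); the volume IDs are placeholders drawn from elsewhere in this listing, and download_vols(), process_pages(), corpus_fromlist() and apply_stoplist() are assumed to come from the surrounding project:

# Placeholder HathiTrust volume IDs; any valid IDs would do.
ids = ['mdp.39015056038089', 'inu.32000013024635']
corpus = create_corpus(ids, verbose=1)
print(corpus.context_data[0]['book_label'])  # one label per downloaded volume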
Example #5
    def test_id_list_remote_load(self, ids, titles):
        feature_reader = FeatureReader(ids=ids)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == titles[i]
Example #6
 def test_big_pages(self):
     ''' Test a document with *many* tokens per page. '''
     path = os.path.join('tests', 'data',
                         'aeu.ark+=13960=t1rf63t52.json.bz2')
     feature_reader = FeatureReader(path)
     volume = feature_reader.first()
     tokenlist = volume.tokenlist()
     assert tokenlist.shape[0] == 56397
Example #7
    def test_list_load(self, paths):
        feature_reader = FeatureReader(paths)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == self.TITLES[i]
Example #8
    def test_list_load(self, paths):
        feature_reader = FeatureReader(paths)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == self.TITLES[i]
Example #9
    def test_compress_error(self, paths):
        feature_reader = FeatureReader(paths, compressed=False)
        with pytest.raises(ValueError):
            next(feature_reader.volumes())

        paths = [path.replace('.bz2', '') for path in paths]
        feature_reader = FeatureReader(paths, compressed=True)
        with pytest.raises(IOError):
            next(feature_reader.volumes())
Example #10
    def test_parquet_reading(self, ids, titles):
        dirpath = os.path.join('tests', 'data', 'partialparq')
        feature_reader = FeatureReader(ids=ids, format='parquet', dir=dirpath)

        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == titles[i]
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Path to document to parse', nargs='*')
    parser.add_argument('-f',
                        '--frame-size',
                        default=10,
                        type=int,
                        help='Number of pages to use in sliding frame')
    parser.add_argument('-o',
                        '--outpath',
                        default="tmp",
                        type=str,  # 'unicode' exists only in Python 2
                        help='Output directory')
    args = parser.parse_args()

    append = False
    frame = []

    for volinput in args.input:
        # Get a list of json.bz2 files to read
        freader = FeatureReader(volinput)
        vol = next(freader.volumes())  # .next() is Python 2-only

        # Remove special characters from title. This will allow us to name a file after it
        clean_id = ''.join([char for char in vol.title if char.isalnum()])

        # Open files for training (Doc=1 page) and inference (Doc=sliding set of pages)
        tfile = open(
            os.path.join(args.outpath, 'train-{}.txt'.format(clean_id)),
            'w+' if not append else 'a')
        inferfile = open(
            os.path.join(args.outpath, 'infer-{}.txt'.format(clean_id)),
            'w+' if not append else 'a')

        for page in vol.pages():
            all_terms = explode_terms(page)

            # Data cleaning
            all_terms = [clean(term) for term in all_terms]
            all_terms = [term for term in all_terms if term]

            # Make into string
            pagetxt = " ".join(all_terms)
            frame += [pagetxt]
            while len(frame) > args.frame_size:
                frame = frame[1:]
            tfile.write('page{0} page{0} {1}\n'.format(page.seq, pagetxt))
            inferfile.write('pages{0}to{1} pages{0}to{1} {2}\n'.format(
                page.seq + 1 - len(frame), page.seq, " ".join(frame)))
        tfile.close()
        inferfile.close()
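The helpers explode_terms() and clean() are referenced above but not shown; the following is only a plausible sketch of what they might do, not the project's actual implementation:

def explode_terms(page):
    # Repeat each body token according to its count on the page.
    tl = page.tokenlist(section='body', pos=False)
    terms = []
    for idx, n in tl['count'].items():
        token = idx[-1] if isinstance(idx, tuple) else idx  # token is the innermost index level
        terms.extend([token] * int(n))
    return terms

def clean(term):
    # Keep lowercased alphabetic tokens only; everything else becomes ''.
    term = term.strip().lower()
    return term if term.isalpha() else ''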
Example #12
    def test_internal_tokencount_representation(self, paths):
        paths = paths[0]
        feature_reader = FeatureReader(paths, compression=None)
        vol = next(feature_reader.volumes())

        assert vol._tokencounts.empty
        vol.tokenlist()
        assert vol._tokencounts.index.names == [
            'page', 'section', 'token', 'pos'
        ]
        vol.tokenlist(case=False)
        assert vol._tokencounts.index.names == [
            'page', 'section', 'token', 'pos'
        ]
Example #13
 def test_caching(self, paths):
     import time
     # Load new volume specifically for this test
     paths = paths[0]
     feature_reader = FeatureReader(paths, compression=None)
     vol = next(feature_reader.volumes())
     # Systems are different, the rough test here simply checks whether
     # the first run is much slower than later runs.
     tokenlist_times = []
     for i in range(0, 6):
         start = time.time()
         vol.tokenlist()
         passed = time.time() - start
         tokenlist_times.append(passed)
     assert 2 * tokenlist_times[0] > sum(tokenlist_times[1:])
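Read together, this test and Example #12 describe the caching behaviour sketched informally below (vol is a Volume as in the tests; _tokencounts is the internal cache they inspect):

vol.tokenlist()                # the first call parses the JSON and fills the cache
print(vol._tokencounts.empty)  # False once the cache is populated
vol.tokenlist(case=False)      # later variants are derived from the cached frame,
                               # which keeps its (page, section, token, pos) index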
Example #14
def read_ids(metadata, folder, df):
    '''This function reads in the list of ids scraped from Hathi Trust and the folder destination. It gets the volume and tokenlist from Hathi Trust, and then calls spread table which separates out by page all tokens.'''
    directory = os.path.dirname(folder)
    if not os.path.exists(directory):
        os.makedirs(directory)
    if df:
        md = pd.read_csv(metadata)
        volids = md['vol_id'].tolist()
        # volids = [ vol for vol in volids if vol != 'uva.x030697132']
        # idx = volids.index('uva.x030697132') # for '../data_sources/hathi_trust_metadatas/middle_east_news_economic_weekly_1964_1973_008564927.csv' remove volume that is throwing error
        # volids = volids[idx+1:]
        volids = volids[:-1]
        fr = FeatureReader(ids=volids)
        for vol in fr:
            # print(vol.title, vol.id, vol.pub_date)
            row = md.loc[md['vol_id'] == vol.id].copy()
            # print(row, vol.title)
            title = vol.title.lower().split(' ')
            # # title = title.lower().split(" ")
            title = "_".join(title)+'_'+str(row.volume.values[0])+'_'+str(row.date.values[0])
            # title = 'tricontinental_bulletin_'+str(row.volume.values[0])+'_'+str(row.date.values[0])
            
            print(title)
            
            title = folder + title
            file_name = title + '.csv'
            print(file_name)
            a = vol.tokenlist(pos=False, case=False, section='all')
            a.to_csv(file_name)
            spread_table(title, file_name)
    else:
        with open(metadata, "r") as text_file:
            volids = text_file.read().split('\n')
        # volids = [vol for vol in volids if len(vol) > 0]
        # volids = volids[:-1]
        # volids = volids[0:3]
        
        fr = FeatureReader(ids=volids)
        print(len(fr))
        for idx, vol in enumerate(fr):
            print(idx)
            if idx == 2:
                break
            else:
                title = vol.title + ' ' + vol.pub_date
                print(vol.pub_date, vol.title, vol.id)
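A hedged call sketch for read_ids(); the paths are hypothetical, and the CSV branch (df=True) expects the metadata to carry the vol_id, volume and date columns used above:

# Hypothetical paths; point these at your own scraped-metadata CSV and output folder.
read_ids('metadata/journal_volumes.csv', 'output/journal/', df=True)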
Example #15
def make_hashes(vocab, ids=None, paths=None, **kwargs):
    if ids and paths:
        raise "Can't include both ids and paths"
    elif ids:
        fr = FeatureReader(ids=ids)
    elif paths:
        fr = FeatureReader(paths=paths)
    else:
        raise "Need either a list of ids or paths"

    i = 0
    for vol in fr.volumes():
        tokens = set(vol.tokens()).intersection(vocab)
        i += 1
        if i % 100 == 0:
            print(os.getpid(), i, 'files processed')
        yield make_hash(vol.id, tokens, **kwargs)
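A hedged driver for make_hashes() above; the vocabulary and IDs are placeholders, and make_hash() is assumed to be defined in the same module:

# Placeholder vocabulary and volume IDs.
vocab = {'labor', 'capital', 'union'}
for h in make_hashes(vocab, ids=['mdp.39015056038089', 'inu.32000013024635']):
    print(h)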
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input',
                       help='Path to document to parse',
                       nargs='*')
    parser.add_argument('-f', '--frame-size', default=10, type=int,
                       help='Number of pages to use in sliding frame')
    parser.add_argument('-o', '--outpath', default="tmp", type=str,
                       help='Output directory')
    args = parser.parse_args()

    append = False 
    frame = []

    for volinput in args.input:
        # Get a list of json.bz2 files to read
        freader = FeatureReader(volinput)
        vol = next(freader.volumes())  # .next() is Python 2-only

        # Remove special characters from title. This will allow us to name a file after it
        clean_id = ''.join([char for char in vol.title if char.isalnum()])
        
        # Open files for training (Doc=1 page) and inference (Doc=sliding set of pages)
        tfile = open(os.path.join(args.outpath, 'train-{}.txt'.format(clean_id)), 'w+' if not append else 'a')
        inferfile = open(os.path.join(args.outpath, 'infer-{}.txt'.format(clean_id)), 'w+' if not append else 'a')

        for page in vol.pages():
            all_terms = explode_terms(page)
            
            # Data cleaning
            all_terms = [clean(term) for term in all_terms]
            all_terms = [term for term in all_terms if term]

            # Make into string
            pagetxt = " ".join(all_terms)
            frame += [pagetxt]
            while len(frame) > args.frame_size:
                frame = frame[1:]
            tfile.write('page{0} page{0} {1}\n'.format(page.seq, pagetxt))
            inferfile.write('pages{0}to{1} pages{0}to{1} {2}\n'.format(page.seq+1-len(frame), 
                                                       page.seq, 
                                                       " ".join(frame)))
        tfile.close()
        inferfile.close()
Example #17
def main():
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4]  # Truncate list for example

    # Open file for writing results
    f = bz2.BZ2File('term_volume_counts.bz2', "w")

    # Start a feature reader with the paths and pass the mapping function
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)

    # Save the results
    for vol, result in results:
        for t, c in result.iteritems():  # result.items() in python3
            s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], t, c)
            f.write(s.encode('UTF-8'))  # BZ2File is binary; encoded bytes are needed in Python 2 and 3

    f.close()
Example #18
def old():
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4] # Truncate list for example

    # Open file for writing results
    f = bz2.BZ2File('term_volume_counts.bz2', "w")

    # Start a feature reader with the paths and pass the mapping function
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)

    # Save the results
    for vol, result in results:
        for t,c in result.iteritems(): # result.items() in python3
            s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1],t,c)
            f.write(s.encode('UTF-8')) # BZ2File is binary; encoded bytes are needed in Python 2 and 3

    f.close()
Example #19
def create_corpus(ids, nltk_stop=False, freq=0, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [
        os.path.basename(p).replace('.json.bz2', '') for p in paths
    ]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    if sys.version_info[0] == 2:
        TD = backports.tempfile.TemporaryDirectory
    else:
        TD = tempfile.TemporaryDirectory
    with TD(prefix='vsm-') as pickle_dir:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            fr = FeatureReader(paths)
            corpus = []
            with concurrent.futures.ProcessPoolExecutor() as executor:
                vols = [
                    executor.submit(process_pages, vol, pickle_dir)
                    for id_n, vol in enumerate(fr.volumes())
                ]

                if verbose:
                    for _ in concurrent.futures.as_completed(vols):
                        n += 1
                        pbar.update(n)
                    pbar.finish()

                corpus_files = [vol.result() for vol in vols]

            corpus = [PickledWords(filename) for filename in corpus_files]

        c = corpus_fromlist(corpus, context_type='book')
        c = apply_stoplist(c, nltk_stop=nltk_stop, freq=freq)
        c.context_data[0]['book_label'] = filtered_ids

    return c
Example #20
def create_corpus(ids, nltk_stop=False, freq=0, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2','') for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    if sys.version_info[0] == 2:
        TD = backports.tempfile.TemporaryDirectory 
    else:
        TD = tempfile.TemporaryDirectory
    with TD(prefix='vsm-') as pickle_dir:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            fr = FeatureReader(paths)
            corpus = []
            with concurrent.futures.ProcessPoolExecutor() as executor:
                vols = [executor.submit(process_pages, vol, pickle_dir) 
                            for id_n, vol in enumerate(fr.volumes())]
                
                if verbose:
                    for _ in concurrent.futures.as_completed(vols):
                        n += 1
                        pbar.update(n)
                    pbar.finish()

                corpus_files = [vol.result() for vol in vols]

            corpus = [PickledWords(filename) for filename in corpus_files]
    
        c = corpus_fromlist(corpus, context_type='book')
        c = apply_stoplist(c, nltk_stop=nltk_stop, freq=freq)
        c.context_data[0]['book_label'] = filtered_ids

    return c
Example #21
def generic_processor(map_func,
                      result_func,
                      paths,
                      outpath=None,
                      batch_size=1000):
    if outpath:
        f = bz2.BZ2File(outpath, "w")
    else:
        f = sys.stdout
    csvf = csv.writer(f)
    n = 0
    m = math.ceil(float(len(paths)) / batch_size)

    logging.info("Script started")

    while True:
        start = time.time()
        batch, paths = (paths[:batch_size], paths[batch_size:])
        n += 1
        logging.info("Starting batch {0}/{1}".format(n, m))
        feature_reader = FeatureReader(batch)

        results = feature_reader.multiprocessing(map_func)
        result_func(results, csvf)

        logging.info("Batch of {0} volumes finished in in {1}s".format(
            len(batch),
            time.time() - start))

        if outpath:
            logging.debug("Output filesize is currently: {0}Gb".format(
                os.stat(outpath).st_size / (1024**3)))

        if len(paths) == 0:
            break

    logging.info("Script done")
    f.close()
Example #22
    def test_compress_error(self, paths):
        feature_reader = FeatureReader(paths, compressed=False)
        with pytest.raises(ValueError):
            next(feature_reader.volumes())

        paths = [path.replace('.bz2', '') for path in paths]
        feature_reader = FeatureReader(paths, compressed=True)
        with pytest.raises(IOError):
            next(feature_reader.volumes())
Example #23
def read_collections(metadata):

    md = pd.read_table(metadata)
    ids = md['htitem_id'].tolist()
    # print(ids)

    fr = FeatureReader(ids=ids)
    for vol in fr:
        row = md.loc[md['htitem_id'] == vol.id].copy()
        title = row['title'].values[0]
        title = title.lower().replace('.', '').split(" ")
        title = "_".join(title)
        print(title)
        file_name = title + '.csv'
        a = vol.tokenlist(pos=False, case=False, section='all')
        a.to_csv(file_name)
        spread_table(title, file_name)
Example #24
def read_collections(metadata):
    
    md = pd.read_table(metadata)
    ids = md['htitem_id'].tolist()
    print(ids)
    volids = workset.load_hathitrust_collection('https://babel.hathitrust.org/cgi/mb?a=listis;c=1885558567')
    print(volids)
    # https://babel.hathitrust.org/cgi/mb?a=listis;c=648138425
    # print('inu.32000013025095', 'inu.32000013025087', 'inu.32000013025079', 'inu.32000013025061', 'inu.32000013025053', 'inu.32000013025046', 'mdp.39015056038089', 'inu.32000013024635', 'inu.32000013024627', 'mdp.39015056038071', 'mdp.39015056038246', 'mdp.39015056038253', 'inu.32000013025194', 'inu.32000013025160', 'mdp.39015056038063', 'mdp.39015056038238', 'mdp.39015056038402', 'mdp.39015056038410', 'inu.32000013025152', 'inu.32000013025145', 'inu.32000013025137', 'inu.32000013025129', 'mdp.39015056038220', 'mdp.39015056038386', 'mdp.39015056038394', 'inu.32000013025111', 'inu.32000013025103', 'mdp.39015056038212', 'mdp.39015056038378', 'mdp.39015056038204', 'mdp.39015056038352', )
    # volids= ['uva.x030696874', 'uva.x030696873', 'uva.x030696872', 'uva.x030696871', 'uva.x030696870', 'uva.x030696869', 'uva.x030696867', 'uva.x030696866', 'uva.x030696865', 'uva.x030696864', 'uva.x030696833', 'uva.x030696834', 'uva.x030696835', 'uva.x030696836', 'uva.x030696837', 'uva.x030696838', 'uva.x030696839', 'uva.x030696840', 'uva.x030696841', 'uva.x030696843', 'uva.x030696844', 'uva.x030696845', 'uva.x030696848', 'uva.x030696849', 'uva.x030696850', 'uva.x030696851', 'uva.x030696852', 'uva.x030696854', 'uva.x030696855', 'uva.x030696856', 'uva.x030696857', 'uva.x030696859', 'uva.x030696858', 'uva.x030696860', 'uva.x030696861', 'uva.x030696862', 'uva.x030696863', 'uva.x030696876', 'uva.x030696877', 'uva.x030696878', 'uva.x030696879', 'uva.x030696880', 'uva.x030696881', 'uva.x030696882', 'uva.x030696883', 'uva.x030696884', 'uva.x030696885', 'uva.x030696886', 'uva.x030696887', 'uva.x030696888', 'uva.x030696889', 'uva.x030696890', 'uva.x030696891', 'uva.x030696892', 'uva.x030696895', 'uva.x030696896', 'uva.x030696897', 'uva.x030696898', 'uva.x030696899', 'uva.x030696900', 'uva.x030696901', 'uva.x030696902', 'uva.x030696903', 'uva.x030696904', 'inu.30000081508032', 'inu.30000122990637', 'uva.x030696905', 'uva.x030696906', 'uva.x030696907', 'uva.x030696908', 'uva.x030696909', 'uva.x030696910', 'uva.x030696911', 'uva.x030696912', 'uva.x030696913', 'uva.x030696914', 'uva.x030696915', 'uva.x030696916', 'uva.x030696917', 'uva.x030696918', 'uva.x030696919', 'uva.x030696920', 'uva.x030696921', 'uva.x030696923', 'uva.x030696924', 'uva.x030696926', 'uva.x030696927', 'uva.x030696928', 'uva.x030696929', 'uva.x030696930', 'uva.x030696931', 'uva.x030696932', 'uva.x030696934', 'uva.x030696935', 'uva.x030696936', 'uva.x030696937', 'uva.x030696938', 'uva.x030696940', 'uva.x030696941', 'uva.x030696942', 'uva.x030696943', 'uva.x030696944', 'uva.x030696945', 'uva.x030696946', 'uva.x030696947', 'uva.x030696948', 'uva.x030696949', 'uva.x030697028', 'uva.x030697029', 'uva.x030697030', 'uva.x030697031', 'uva.x030697032', 'uva.x030697033', 'uva.x030697034', 'uva.x030697035', 'inu.30000081508123', 'inu.30000081508115', 'inu.30000122990629']
    volids = ['uva.x030696874', 'uva.x030696873', 'uva.x030696872']
    fr = FeatureReader(ids=volids)
    for vol in fr:
        row = md.loc[md['htitem_id'] == vol.id].copy()
        title = row['title'].values[0]
        title = title.lower().replace('.', '').split(" ")
        title = "_".join(title)
        file_name = title + '.csv'
        a = vol.tokenlist(pos=False, case=False, section='all')
        a.to_csv(file_name)
        spread_table(title, file_name)
Example #25
def read_collections(metadata, folder):
    '''This function reads in the metadata of a collection created on Hathi Trust and the folder destination. It gets the volume and tokenlist from Hathi Trust, and then calls spread table which separates out by page all tokens.'''

    directory = os.path.dirname(folder)
    if not os.path.exists(directory):
        os.makedirs(directory)
    md = pd.read_csv(metadata,sep='\t')
    volids = md['htitem_id'].tolist()
    print(volids)
    fr = FeatureReader(ids=volids)
    for vol in fr:
        row = md.loc[md['htitem_id'] == vol.id].copy()
        title = row['title'].values[0]
        print(title)
        # title = title.lower().replace('.', '').replace('&', 'and').replace('/', ' ').replace('-', ' ').split(" ")
        name = title.lower().split(':')[0].split(' ')
        dates = "_".join(title.split(' ')[-3:])
        title= folder+"_".join(name)+dates
        print(title)
        file_name = title + '.csv'
        # print(file_name, folder)
        a = vol.tokenlist(pos=False, case=False, section='all')
        a.to_csv(file_name)
        spread_table(title, file_name)
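A hedged call sketch for read_collections(); the paths are hypothetical, and the metadata file is assumed to be a tab-separated HathiTrust collection export with htitem_id and title columns:

# Hypothetical paths; point these at a real collection export and output folder.
read_collections('metadata/collection_export.tsv', 'output/collection/')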
Example #26
 def test_id_remote_load(self, ids):
     id = ids[0]
     feature_reader = FeatureReader(ids=id)
     vol = next(feature_reader.volumes())
     assert type(vol) == htrc_features.feature_reader.Volume
Example #27
 def test_first(self, paths):
     feature_reader = FeatureReader(paths)
     vol = feature_reader.first()
     assert type(vol) == htrc_features.feature_reader.Volume
     assert vol.title == self.TITLES[0]
Example #28
 def test_iteration(self, paths):
     feature_reader = FeatureReader(paths)
     for vol in feature_reader:
         assert type(vol) == htrc_features.feature_reader.Volume
     for vol in feature_reader.volumes():
         assert type(vol) == htrc_features.feature_reader.Volume
Example #29
import glob
import pandas as pd
from htrc_features import FeatureReader
import csv
with open('C:\\hathi\\Students.csv', 'r') as f:
    reader = csv.reader(f)
    hathi_list = list(reader)

for i in hathi_list:
    try:
        print('Grabbing tokenlist for: ', i)
        fr = FeatureReader(ids=i)
        for vol in fr:
            tokens = vol.tokenlist()
            tokens.to_dict()
            #matches = tokens['count'] > 10
            #tokens[matches].sample(100)
            filename = 'C:\\hathi\\' + str(i) + '.txt'
            with open(filename, 'w') as outfile:
                outfile.write(str(tokens))
    except:
        pass




Example #30
def volume(paths):
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    return next(feature_reader.volumes())
Example #31
 def test_uncompressed(self, paths, titles):
     paths = [path.replace('.bz2', '') for path in paths]
     feature_reader = FeatureReader(paths, compressed=False)
     for i, vol in enumerate(feature_reader):
         assert type(vol) == htrc_features.feature_reader.Volume
         assert vol.title == titles[i]
Example #32
 def test_single_path_load(self, paths):
     path = paths[0]
     feature_reader = FeatureReader(path)
     vol = next(feature_reader.volumes())
     assert type(vol) == htrc_features.feature_reader.Volume
Example #33
 def test_first(self, paths, titles):
     feature_reader = FeatureReader(paths)
     vol = feature_reader.first()
     assert type(vol) == htrc_features.feature_reader.Volume
     assert vol.title == titles[0]
Example #34
 def test_iteration(self, paths):
     feature_reader = FeatureReader(paths)
     for vol in feature_reader:
         assert type(vol) == htrc_features.feature_reader.Volume
     for vol in feature_reader.volumes():
         assert type(vol) == htrc_features.feature_reader.Volume
Example #35
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar  7 19:47:13 2019

@author: kashish
"""

from htrc_features import FeatureReader

import json

path = ['data.json']

fr = FeatureReader(path)  # renamed so the reader is not shadowed by the file handle below
with open('data.json') as f:
    data = json.load(f)
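The reader above is built but never used; a small hedged follow-up, assuming data.json is a valid, uncompressed Extracted Features volume (in which case the reader may need compression=None, as in Examples #12 and #30):

fr = FeatureReader(path, compression=None)  # explicit: the file is plain JSON, not bz2
vol = fr.first()
print(vol.id, vol.title)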
Example #36
#     df.to_csv('./htrc_pages/'+str(page)+'_'+title+'.csv')
# test = pd.read_csv('term_page_freq.csv')
# print(test[0:2])
# result = [i for i in glob.glob('*.{}'.format('csv'))]
# # final_df = pd.DataFrame(columns=['page', 'lowercase', 'counts'], index=None)
# output_path = 'final_htrc.csv'
# for filename in result:
#     if os.path.exists(output_path):
#         df = pd.read_csv(filename, index_col=False)
#         df.to_csv(output_path, mode='a', header=False, index=False)
#     else:
#         df = pd.read_csv(filename, index_col=False)
#         df.to_csv(output_path, header=True, index=False)
volids = workset.load_hathitrust_collection(
    'https://babel.hathitrust.org/cgi/mb?a=listis&c=648138425')
fr = FeatureReader(ids=volids)
# # print(fr)
# fr = FeatureReader(ids=["inu.30000125592232"])
# # # # print(fr)

# # # # # final_df = pd.DataFrame(columns=['page', 'character', 'frequency'], index=None)
# # # # output_path = 'htrc_test.csv'
for index, vol in enumerate(fr):
    # #     a = vol.tokens_per_page()
    # #     print(a)
    # a = vol.tokenlist(pos=False, case=False, section='all')
    # a.to_csv(vol.title + str(index) + '_vol_page_freq.csv')
    # print("Volume %s is a %s page text written in %s. You can doublecheck at %s" % (vol.id, vol.page_count, vol.language, vol.handle_url))
    # print(vol.metadata)
    print(vol.title)
# #     print(vol.metadata['published'][0])
Example #37
 def test_single_path_load(self, paths):
     path = paths[0]
     feature_reader = FeatureReader(path)
     vol = next(feature_reader.volumes())
     assert type(vol) == htrc_features.feature_reader.Volume
Example #38
def get_token_counts(vol, hat, tail):
    df_tl = vol.tokenlist().reset_index()  # convert to a dataframe
    df_tl = df_tl[df_tl['section'] == 'body']  # get rid of header and footer; keep only body
    page_count = df_tl['page'].tolist()[-1]  # get the total page number
    page_hat = round(page_count * hat)  # find the page at the `hat` fraction from the front
    page_tail = page_count - round(page_count * tail)  # find the page at the `tail` fraction from the back
    df_tl = df_tl[df_tl['page'].between(page_hat, page_tail, inclusive=False)]  # locate the pages in between
    series_tl = df_tl.groupby(["token"]).size()  # group the tokens across pages
    new_df_tl = series_tl.to_frame().reset_index()  # convert back to a dataframe
    return new_df_tl

docfreqs = Counter()
termfreqs = dict()
ctr = 0

fr = FeatureReader(paths)
for vol in fr.volumes():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)

    output = get_token_counts(vol, 0.15, 0.05)
    docid = str(vol.id)

    thesewords = Counter()

    for row in output.itertuples(index=False):
        if pd.isnull(row[0]):
            continue
        word = row[0].lower().strip('.",')
Example #39
import tensorflow as tf
import pandas as pd
from htrc_features import FeatureReader, utils
import itertools
import glob
from ef_utils import *

ef_root = "data/ef-files/comedy/"
ef_file_paths = glob.glob(ef_root + "/*.bz2")
ef_files = FeatureReader(paths=list(ef_file_paths))

token_ref = load_tokenref('eng-vocab-1.txt.bz2', trim_head=0)

volumes = ef_files.volumes()

i = 0
writer = tf.python_io.TFRecordWriter(
    'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))

for vol in volumes:
    i += 1
    if i % 100 == 0:
        writer.close()
        writer = tf.python_io.TFRecordWriter(
            'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))

    print(vol.id)
    pages_en = [p for p in vol.pages() if {'en': '1.00'} in p.languages]
    for page in pages_en:
        page_body_tokens = page.tokenlist(section='body',
                                          case=False,