def test_latin_trigrams(minipop, mini_latin_metadata):
    """Visualize Latin sound trigrams stored in the database.

    Mirrors ``test_greek_trigrams``: dumps v5 trigram features and v3
    reference results side by side, then fails on purpose (``assert
    False``) so pytest shows the captured output.  Note that v5 results
    do not have stopwords filtered out, while v3 results probably do.
    """
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    # Collect raw v5 feature rows from the first text's line units.
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]
    # Collect the v3 reference matches shipped alongside the corpus.
    raw_v3_results = _load_v3_results(
        texts[0].path, 'mini_latin_results_3gr.tab')
    v3_results = [match['matched_features'] for match in raw_v3_results]
    v5_results = []
    print('v5 results:')
    for feature_row in raw_v5_results:
        print(feature_row)
        for feature in feature_row:
            print(feature)
            feature = np.asarray(feature)
            print('array', feature)
            print(np.shape(feature))
            tokens = get_stoplist_tokens(minipop, feature, 'sound', 'latin')
            v5_results.append(tokens)
    print(v5_results)
    print('v3 results:')
    for match in v3_results:
        print(match)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    # Deliberate failure: forces pytest to display the printed comparison.
    assert False
def test_greek_trigrams(minipop, mini_greek_metadata):
    """Visualize Greek sound trigrams stored in the database.

    For the purpose of visualization.  Use to confirm that trigrams are
    being stored in the database correctly.  It should be noted that v5
    results do not have stopwords filtered out, while v3 results
    probably do.  Ends with a deliberate ``assert False`` so pytest
    shows the captured output.
    """
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    # Collect raw v5 feature rows from the first text's line units.
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]
    # Collect the v3 reference matches shipped alongside the corpus.
    raw_v3_results = _load_v3_results(
        texts[0].path, 'mini_greek_results_3gr.tab')
    v3_results = [match['matched_features'] for match in raw_v3_results]
    v5_results = []
    print('v5 results:')
    for feature_row in raw_v5_results:
        print(feature_row)
        for feature in feature_row:
            feature = np.asarray(feature)
            tokens = get_stoplist_tokens(minipop, feature, 'sound', 'greek')
            v5_results.append(tokens)
    print(v5_results)
    print('v3 results:')
    for match in v3_results:
        print(match)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    # Deliberate failure: forces pytest to display the printed comparison.
    assert False
def main():
    """Perform Tesserae search and display the top 10 results.

    Reads source/target text selection and matcher settings from the
    command line, reuses cached results when an identical search has
    already been stored, and prints the top-scoring matches as a
    tab-separated table.
    """
    args = parse_args()
    # NOTE(review): the original credential/connection code here was
    # scrubbed by a secrets filter ('******'), leaving a syntax error.
    # Reconstructed from the surrounding argument usage — TODO confirm
    # against version control that this matches the original.
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = ''
    connection = TessMongoConnection(
        args.host, args.port, args.user, password, db=args.database)
    # NOTE(review): source args normalize '-' but target args normalize
    # '_' — presumably intentional CLI convention; verify.
    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(
        text=connection.find('texts', author=source_author,
                             title=source_title)[0],
        unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(
        text=connection.find('texts', author=target_author,
                             title=target_title)[0],
        unit_type=args.target_unit)

    start = time.time()
    # 'corpus' basis uses corpus-wide frequencies; otherwise restrict the
    # stoplist computation to the two texts being searched.
    stopword_indices = create_stoplist(
        connection, args.n_stopwords, args.feature, source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else [
            source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(
        connection, stopword_indices, args.feature, source.text.language)
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection, results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH, parameters=parameters)
        connection.insert(search)
    search_params = {
        'source': source,
        'target': target,
        'feature': parameters['method']['feature'],
        'stopwords': parameters['method']['stopwords'],
        'freq_basis': parameters['method']['freq_basis'],
        'max_distance': parameters['method']['max_distance'],
        'distance_basis': parameters['method']['distance_basis'],
        'min_score': 0
    }
    # NOTE(review): _run_search is invoked even on a cache hit, as in the
    # original — presumably it is a no-op for completed searches; confirm.
    _run_search(connection, search, SparseMatrixSearch.matcher_type,
                search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = min(10, len(matches))
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    # Slice by display_count (was a hard-coded [:10]); identical result,
    # but keeps the count and the slice in agreement.
    for i, m in enumerate(matches[:display_count]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{[t for t in shared]}')
def query_stopwords():
    """Build a stopwords list.

    Query parameters (all optional):
      * ``feature`` — feature type to rank (default ``'lemmata'``).
      * ``list_size`` — number of stopwords to return (default 10).
      * ``language`` — compute the list over all texts in this language;
        takes precedence over ``works``.
      * ``works`` — comma-separated text ObjectIds to use as the basis.

    Returns a JSON ``{'stopwords': [...]}`` payload, or a 400 error when
    the arguments are malformed or insufficient.
    """
    if not flask.request.args:
        # default response when no arguments are given
        return flask.jsonify({'stopwords': []})

    feature = flask.request.args.get('feature', 'lemmata')
    list_size = flask.request.args.get('list_size', 10)
    try:
        list_size = int(list_size)
    except ValueError:
        return apitess.errors.error(
            400,
            data={k: v for k, v in flask.request.args.items()},
            message='"list_size" must be an integer')

    # language takes precedence over works
    language = flask.request.args.get('language', None)
    if language:
        stopword_indices = create_stoplist(flask.g.db, list_size, feature,
                                           language)
        if len(stopword_indices) == 0:
            return apitess.errors.error(
                400,
                data={k: v for k, v in flask.request.args.items()},
                message='No stopwords found for feature "{}" in language "{}".'
                .format(feature, language))
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                language)
        })

    works = flask.request.args.get('works', None)
    if works:
        oids, fails = apitess.utils.parse_works_arg(works)
        if fails:
            return apitess.errors.bad_object_ids(fails, flask.request.args)
        text_results = flask.g.db.find(tesserae.db.entities.Text.collection,
                                       _id=oids)
        if len(text_results) != len(oids):
            # figure out which works were not found in the database and report
            found = {str(r.id) for r in text_results}
            not_found = [obj_id for obj_id in oids if obj_id not in found]
            return apitess.errors.error(
                400,
                data={k: v for k, v in flask.request.args.items()},
                message=('The following works could not be found '
                         f'in the database: {not_found}'))
        stopword_indices = create_stoplist(
            flask.g.db, list_size, feature, text_results[0].language,
            basis=[str(t.id) for t in text_results])
        # BUGFIX: the original passed `language` here, which is always
        # None/empty in this branch (the `if language:` branch above was
        # not taken).  Use the language of the requested works, matching
        # the create_stoplist call just above.
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                text_results[0].language)
        })

    # if we get here, then we didn't get enough information
    return apitess.errors.error(
        400,
        data={k: v for k, v in flask.request.args.items()},
        message=(
            'Insufficient information was given to calculate a stopwords '
            'list (Perhaps you forgot to specify "language" or "works").'))