Python Indexer.count_word 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: Indexer

클래스/타입: Indexer

메소드/함수: count_word

hotexamples.com에서의 예제들: 2

Python Indexer.count_word - 2개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한, Python의 Indexer.Indexer.count_word에 대한 평가가 가장 높은 실제 사용 예제들입니다. 예제를 평가하면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Indexer(30)

add_sentence(2)

add_word(2)

count_word(2)

__init__(2)

addToIndex(1)

dump(1)

tagcloud(1)

search(1)

indexer(1)

get_terms(1)

get_posting_list(1)

get_normalized_fequency(1)

extract_classes(1)

create_inverted_index(1)

document_frequency_normalized(1)

add(1)

count_word_in_text(1)

UserInterface(1)

compute_tdidf(1)

build_dictionary(1)

build_data_structure(1)

buidlindex(1)

add_sentences(1)

term_document_frequency(1)

예제 #1

파일 보기

class StanfordTwitterEmbedding(AbstractEmbedding):
    """Pretrained GloVe Twitter embedding (``glove.twitter.27B.200d``).

    Builds an ``Indexer`` over the pretrained vocabulary and an
    ``nn.Embedding`` whose leading rows are zero vectors (one per special
    token), followed by the pretrained vectors.
    """

    def __init__(self, device, *, with_raw_file=False):
        """Load the vocabulary/weights and build the indexer and embedding.

        Args:
            device: Forwarded to the base-class constructor and used as the
                target of ``self.embedding.to(...)``.
            with_raw_file: When true, parse the raw GloVe text file and
                (re)write the ``vocab.pkl`` / ``weights.pkl`` caches next to
                it. When false (the default, matching the previous
                hard-coded behavior), load those two pickle caches instead.
        """
        super(StanfordTwitterEmbedding, self).__init__(device=device)
        self.path = Path(
            '../data/models/glove.twitter.27B/glove.twitter.27B.200d.txt')
        vocab_path = self.path.parent / 'vocab.pkl'
        weights_path = self.path.parent / 'weights.pkl'

        if with_raw_file:
            with self.path.open('r', encoding='utf-8-sig') as f:
                texts = f.readlines()
            # Parse every line in parallel, then transpose the list of
            # (word, vector) pairs into two parallel lists.
            vocab, weights = map(
                list,
                zip(*Parallel(n_jobs=10)
                    ([delayed(self.get_weights)(text) for text in texts])))
            with vocab_path.open('wb') as f:
                pickle.dump(vocab, f)
            with weights_path.open('wb') as f:
                pickle.dump(weights, f)
        else:
            # SECURITY NOTE: pickle.load can execute arbitrary code; these
            # cache files must come from a trusted source (i.e. be produced
            # by the ``with_raw_file=True`` branch above).
            with vocab_path.open('rb') as f:
                vocab = pickle.load(f)
            with weights_path.open('rb') as f:
                weights = pickle.load(f)

        # '<\\s>' is a backslash followed by 's' -- the exact value the
        # previous literal '<\s>' produced, but without the invalid-escape
        # warning.
        # NOTE(review): this looks like it was meant to be '</s>' -- confirm
        # against the Indexer's consumers before changing the value.
        special_tokens = {
            '<s>': 0,
            '<unk>': 1,
            '<pad>': 2,
            '<\\s>': 3,
            '<mask>': 4,
        }
        # Computed up front in case Indexer mutates the dict it is given.
        n_special = len(special_tokens)
        # with_del_stopwords is not set in this class; presumably it is
        # provided by AbstractEmbedding -- TODO confirm.
        self.indexer = Indexer(special_tokens=special_tokens,
                               with_del_stopwords=self.with_del_stopwords)
        for word in vocab:
            self.indexer.count_word(word)
            self.indexer.add_word(word)

        self.embedding_dim = len(weights[0])
        # One zero row per special token, placed before the pretrained rows.
        # This assumes the Indexer numbers vocabulary words after the special
        # tokens, in insertion order -- TODO confirm.
        special_weights = [[0.0] * self.embedding_dim
                           for _ in range(n_special)]
        weights = torch.FloatTensor(special_weights + weights)
        self.embedding = nn.Embedding.from_pretrained(
            embeddings=weights, padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def get_weights(self, text):
        """Split one raw line into ``(word, vector)``.

        The line is split on single spaces; the first field is returned as
        the word and the remaining fields are converted with ``float``.
        """
        content = text.split(' ')
        return content[0], list(map(float, content[1:]))

예제 #2

파일 보기

class NtuaTwitterEmbedding(AbstractEmbedding):
    """Pretrained NTUA-SLP Twitter embedding (``ntua_twitter_300.txt``).

    Builds an ``Indexer`` over the pretrained vocabulary and an
    ``nn.Embedding`` whose leading rows are zero vectors (one per special
    token), followed by the pretrained vectors.
    """

    def __init__(self, device):
        """Parse the embedding text file and build the indexer and embedding.

        Args:
            device: Forwarded to the base-class constructor and used as the
                target of ``self.embedding.to(...)``.
        """
        super(NtuaTwitterEmbedding, self).__init__(device=device)
        self.path = Path('../data/models/ntua-slp-semeval2018/ntua_twitter_300.txt')
        with self.path.open('r', encoding='utf-8-sig') as f:
            # The first line is a header; its second field is read below as
            # the embedding dimension.
            headers = f.readline().strip().split(' ')
            # Skip blank lines (e.g. a trailing newline at end of file):
            # they would otherwise yield an empty word with an empty vector
            # and make the weight matrix ragged.
            stripped_lines = (line.strip() for line in f)
            contents = [line.split(' ') for line in stripped_lines if line]
        vocab = [content[0] for content in contents]
        weights = [list(map(float, content[1:])) for content in contents]

        # '<\\s>' is a backslash followed by 's' -- the exact value the
        # previous literal '<\s>' produced, but without the invalid-escape
        # warning.
        # NOTE(review): this looks like it was meant to be '</s>' -- confirm
        # against the Indexer's consumers before changing the value.
        special_tokens = {'<s>': 0, '<unk>': 1, '<pad>': 2, '<\\s>': 3, '<mask>': 4}
        # Computed up front in case Indexer mutates the dict it is given.
        n_special = len(special_tokens)
        # with_del_stopwords is not set in this class; presumably it is
        # provided by AbstractEmbedding -- TODO confirm.
        self.indexer = Indexer(special_tokens=special_tokens,
                               with_del_stopwords=self.with_del_stopwords)
        for word in vocab:
            self.indexer.count_word(word)
            self.indexer.add_word(word)

        self.embedding_dim = int(headers[1])
        # One zero row per special token, placed before the pretrained rows.
        # This assumes the Indexer numbers vocabulary words after the special
        # tokens, in insertion order -- TODO confirm.
        special_weights = [[0.0] * self.embedding_dim
                           for _ in range(n_special)]
        weights = torch.FloatTensor(special_weights + weights)
        self.embedding = nn.Embedding.from_pretrained(embeddings=weights, padding_idx=self.indexer.padding_index)
        self.embedding.to(device)